diff --git a/meson.build b/meson.build index d2eab192618..4afad99f68d 100644 --- a/meson.build +++ b/meson.build @@ -240,7 +240,7 @@ elif _vulkan_drivers.contains('all') _vulkan_drivers = ['amd', 'intel', 'intel_hasvk', 'swrast', 'freedreno', 'panfrost', 'virtio', 'broadcom', 'imagination-experimental', 'microsoft-experimental', - 'nouveau'] + 'nouveau', 'asahi'] endif with_intel_vk = _vulkan_drivers.contains('intel') @@ -255,6 +255,7 @@ with_imagination_vk = _vulkan_drivers.contains('imagination-experimental') with_imagination_srv = get_option('imagination-srv') with_microsoft_vk = _vulkan_drivers.contains('microsoft-experimental') with_nouveau_vk = _vulkan_drivers.contains('nouveau') +with_asahi_vk = _vulkan_drivers.contains('asahi') with_any_vk = _vulkan_drivers.length() != 0 if with_any_vk and host_machine.system() == 'windows' and meson.version().version_compare('< 1.3') @@ -850,7 +851,7 @@ if with_gallium_rusticl endif with_clover_spirv = with_gallium_clover and get_option('opencl-spirv') -with_clc = with_microsoft_clc or with_intel_clc or with_gallium_asahi or with_gallium_rusticl or with_clover_spirv +with_clc = with_microsoft_clc or with_intel_clc or with_gallium_asahi or with_asahi_vk or with_gallium_rusticl or with_clover_spirv dep_clc = null_dep if with_gallium_clover or with_clc diff --git a/meson_options.txt b/meson_options.txt index f8f4ec29513..ff669621267 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -228,7 +228,7 @@ option( value : ['auto'], choices : ['auto', 'amd', 'broadcom', 'freedreno', 'intel', 'intel_hasvk', 'panfrost', 'swrast', 'virtio', 'imagination-experimental', - 'microsoft-experimental', 'nouveau', 'all'], + 'microsoft-experimental', 'nouveau', 'asahi', 'all'], description : 'List of vulkan drivers to build. If this is set to auto ' + 'all drivers applicable to the target OS/architecture ' + 'will be built' diff --git a/src/.clang-format b/src/.clang-format index d13cd051cf4..142700a493c 100644 --- a/src/.clang-format +++ b/src/.clang-format @@ -186,6 +186,8 @@ ForEachMacros: # asahi - foreach_active - foreach_submitted + - hk_foreach_view + - hk_foreach_variant - AGX_BATCH_FOREACH_BO_HANDLE - agx_pack - agx_push diff --git a/src/asahi/meson.build b/src/asahi/meson.build index ac58326a822..c5f08ead519 100644 --- a/src/asahi/meson.build +++ b/src/asahi/meson.build @@ -6,7 +6,7 @@ inc_asahi = include_directories([ '.', 'layout', 'lib', 'genxml', 'compiler' ]) -if with_gallium_asahi +if with_gallium_asahi or with_asahi_vk subdir('layout') subdir('compiler') subdir('clc') @@ -14,6 +14,10 @@ if with_gallium_asahi subdir('lib') endif +if with_asahi_vk + subdir('vulkan') +endif + if with_tools.contains('drm-shim') subdir('drm-shim') endif diff --git a/src/asahi/vulkan/hk_buffer.c b/src/asahi/vulkan/hk_buffer.c new file mode 100644 index 00000000000..63bec5a0f70 --- /dev/null +++ b/src/asahi/vulkan/hk_buffer.c @@ -0,0 +1,286 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ +#include "hk_buffer.h" + +#include "hk_device.h" +#include "hk_device_memory.h" +#include "hk_entrypoints.h" +#include "hk_physical_device.h" + +static uint32_t +hk_get_buffer_alignment(const struct hk_physical_device *pdev, + VkBufferUsageFlags2KHR usage_flags, + VkBufferCreateFlags create_flags) +{ + uint32_t alignment = 16; + + if (usage_flags & VK_BUFFER_USAGE_2_UNIFORM_BUFFER_BIT_KHR) + alignment = MAX2(alignment, HK_MIN_UBO_ALIGNMENT); + + if (usage_flags & VK_BUFFER_USAGE_2_STORAGE_BUFFER_BIT_KHR) + alignment = MAX2(alignment, HK_MIN_SSBO_ALIGNMENT); + + if (usage_flags & (VK_BUFFER_USAGE_2_UNIFORM_TEXEL_BUFFER_BIT_KHR | + VK_BUFFER_USAGE_2_STORAGE_TEXEL_BUFFER_BIT_KHR)) + alignment = MAX2(alignment, HK_MIN_TEXEL_BUFFER_ALIGNMENT); + + if (create_flags & (VK_BUFFER_CREATE_SPARSE_BINDING_BIT | + VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT)) + alignment = MAX2(alignment, 4096); + + return alignment; +} + +static uint64_t +hk_get_bda_replay_addr(const VkBufferCreateInfo *pCreateInfo) +{ + uint64_t addr = 0; + vk_foreach_struct_const(ext, pCreateInfo->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_BUFFER_OPAQUE_CAPTURE_ADDRESS_CREATE_INFO: { + const VkBufferOpaqueCaptureAddressCreateInfo *bda = (void *)ext; + if (bda->opaqueCaptureAddress != 0) { +#ifdef NDEBUG + return bda->opaqueCaptureAddress; +#else + assert(addr == 0 || bda->opaqueCaptureAddress == addr); + addr = bda->opaqueCaptureAddress; +#endif + } + break; + } + + case VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_CREATE_INFO_EXT: { + const VkBufferDeviceAddressCreateInfoEXT *bda = (void *)ext; + if (bda->deviceAddress != 0) { +#ifdef NDEBUG + return bda->deviceAddress; +#else + assert(addr == 0 || bda->deviceAddress == addr); + addr = bda->deviceAddress; +#endif + } + break; + } + + default: + break; + } + } + + return addr; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateBuffer(VkDevice device, const VkBufferCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, VkBuffer *pBuffer) +{ + VK_FROM_HANDLE(hk_device, dev, device); + struct hk_buffer *buffer; + + if (pCreateInfo->size > HK_MAX_BUFFER_SIZE) + return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY); + + buffer = + vk_buffer_create(&dev->vk, pCreateInfo, pAllocator, sizeof(*buffer)); + if (!buffer) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + if (buffer->vk.size > 0 && + (buffer->vk.create_flags & + (VK_BUFFER_CREATE_SPARSE_BINDING_BIT | + VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT))) { + + unreachable("todo"); +#if 0 + const uint32_t alignment = + hk_get_buffer_alignment(hk_device_physical(dev), + buffer->vk.usage, + buffer->vk.create_flags); + assert(alignment >= 4096); + buffer->vma_size_B = align64(buffer->vk.size, alignment); + + const bool sparse_residency = + buffer->vk.create_flags & VK_BUFFER_CREATE_SPARSE_RESIDENCY_BIT; + const bool bda_capture_replay = + buffer->vk.create_flags & VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT; + + uint64_t bda_replay_addr = 0; + if (bda_capture_replay) + bda_replay_addr = hk_get_bda_replay_addr(pCreateInfo); + + buffer->addr = nouveau_ws_alloc_vma(dev->ws_dev, bda_replay_addr, + buffer->vma_size_B, + alignment, bda_capture_replay, + sparse_residency); +#endif + if (buffer->addr == 0) { + vk_buffer_destroy(&dev->vk, pAllocator, &buffer->vk); + return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "Sparse VMA allocation failed"); + } + } + + *pBuffer = hk_buffer_to_handle(buffer); + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL 
+hk_DestroyBuffer(VkDevice device, VkBuffer _buffer, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_buffer, buffer, _buffer); + + if (!buffer) + return; + + if (buffer->vma_size_B > 0) { + unreachable("todo"); +#if 0 + const bool sparse_residency = + buffer->vk.create_flags & VK_BUFFER_CREATE_SPARSE_RESIDENCY_BIT; + const bool bda_capture_replay = + buffer->vk.create_flags & + VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT; + + agx_bo_unbind_vma(dev->ws_dev, buffer->addr, buffer->vma_size_B); + nouveau_ws_free_vma(dev->ws_dev, buffer->addr, buffer->vma_size_B, + bda_capture_replay, sparse_residency); +#endif + } + + vk_buffer_destroy(&dev->vk, pAllocator, &buffer->vk); +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetDeviceBufferMemoryRequirements( + VkDevice device, const VkDeviceBufferMemoryRequirements *pInfo, + VkMemoryRequirements2 *pMemoryRequirements) +{ + VK_FROM_HANDLE(hk_device, dev, device); + struct hk_physical_device *pdev = hk_device_physical(dev); + + const uint32_t alignment = hk_get_buffer_alignment( + hk_device_physical(dev), pInfo->pCreateInfo->usage, + pInfo->pCreateInfo->flags); + + pMemoryRequirements->memoryRequirements = (VkMemoryRequirements){ + .size = align64(pInfo->pCreateInfo->size, alignment), + .alignment = alignment, + .memoryTypeBits = BITFIELD_MASK(pdev->mem_type_count), + }; + + vk_foreach_struct_const(ext, pMemoryRequirements->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: { + VkMemoryDedicatedRequirements *dedicated = (void *)ext; + dedicated->prefersDedicatedAllocation = false; + dedicated->requiresDedicatedAllocation = false; + break; + } + default: + vk_debug_ignored_stype(ext->sType); + break; + } + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetPhysicalDeviceExternalBufferProperties( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceExternalBufferInfo *pExternalBufferInfo, + VkExternalBufferProperties *pExternalBufferProperties) +{ + /* The Vulkan 1.3.256 spec says: + * + * VUID-VkPhysicalDeviceExternalBufferInfo-handleType-parameter + * + * "handleType must be a valid VkExternalMemoryHandleTypeFlagBits value" + * + * This differs from VkPhysicalDeviceExternalImageFormatInfo, which + * surprisingly permits handleType == 0. + */ + assert(pExternalBufferInfo->handleType != 0); + + /* All of the current flags are for sparse which we don't support yet. + * Even when we do support it, doing sparse on external memory sounds + * sketchy. Also, just disallowing flags is the safe option. + */ + if (pExternalBufferInfo->flags) + goto unsupported; + + switch (pExternalBufferInfo->handleType) { + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT: + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT: + pExternalBufferProperties->externalMemoryProperties = + hk_dma_buf_mem_props; + return; + default: + goto unsupported; + } + +unsupported: + /* From the Vulkan 1.3.256 spec: + * + * compatibleHandleTypes must include at least handleType. 
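+    *
+    * So for handle types we don't support, echo the requested handle type
+    * back and leave exportFromImportedHandleTypes and externalMemoryFeatures
+    * zero-initialized, i.e. the memory is neither importable nor exportable.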
+ */ + pExternalBufferProperties->externalMemoryProperties = + (VkExternalMemoryProperties){ + .compatibleHandleTypes = pExternalBufferInfo->handleType, + }; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_BindBufferMemory2(VkDevice device, uint32_t bindInfoCount, + const VkBindBufferMemoryInfo *pBindInfos) +{ + for (uint32_t i = 0; i < bindInfoCount; ++i) { + VK_FROM_HANDLE(hk_device_memory, mem, pBindInfos[i].memory); + VK_FROM_HANDLE(hk_buffer, buffer, pBindInfos[i].buffer); + + if (buffer->vma_size_B) { + unreachable("todo"); +#if 0 + VK_FROM_HANDLE(hk_device, dev, device); + agx_bo_bind_vma(dev->ws_dev, + mem->bo, + buffer->addr, + buffer->vma_size_B, + pBindInfos[i].memoryOffset, + 0 /* pte_kind */); +#endif + } else { + buffer->addr = mem->bo->ptr.gpu + pBindInfos[i].memoryOffset; + } + + const VkBindMemoryStatusKHR *status = + vk_find_struct_const(pBindInfos[i].pNext, BIND_MEMORY_STATUS_KHR); + if (status != NULL && status->pResult != NULL) + *status->pResult = VK_SUCCESS; + } + return VK_SUCCESS; +} + +VKAPI_ATTR VkDeviceAddress VKAPI_CALL +hk_GetBufferDeviceAddress(UNUSED VkDevice device, + const VkBufferDeviceAddressInfo *pInfo) +{ + VK_FROM_HANDLE(hk_buffer, buffer, pInfo->buffer); + + return hk_buffer_address(buffer, 0); +} + +VKAPI_ATTR uint64_t VKAPI_CALL +hk_GetBufferOpaqueCaptureAddress(UNUSED VkDevice device, + const VkBufferDeviceAddressInfo *pInfo) +{ + VK_FROM_HANDLE(hk_buffer, buffer, pInfo->buffer); + + return hk_buffer_address(buffer, 0); +} diff --git a/src/asahi/vulkan/hk_buffer.h b/src/asahi/vulkan/hk_buffer.h new file mode 100644 index 00000000000..f349a3df0e2 --- /dev/null +++ b/src/asahi/vulkan/hk_buffer.h @@ -0,0 +1,45 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#pragma once + +#include "hk_device_memory.h" +#include "hk_private.h" + +#include "vk_buffer.h" + +struct hk_device_memory; +struct hk_physical_device; + +struct hk_buffer { + struct vk_buffer vk; + uint64_t addr; + + /** Size of the reserved VMA range for sparse buffers, zero otherwise. */ + uint64_t vma_size_B; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_buffer, vk.base, VkBuffer, + VK_OBJECT_TYPE_BUFFER) + +static inline uint64_t +hk_buffer_address(const struct hk_buffer *buffer, uint64_t offset) +{ + return buffer->addr + offset; +} + +static inline struct hk_addr_range +hk_buffer_addr_range(const struct hk_buffer *buffer, uint64_t offset, + uint64_t range) +{ + if (buffer == NULL) + return (struct hk_addr_range){.range = 0}; + + return (struct hk_addr_range){ + .addr = hk_buffer_address(buffer, offset), + .range = vk_buffer_range(&buffer->vk, offset, range), + }; +} diff --git a/src/asahi/vulkan/hk_buffer_view.c b/src/asahi/vulkan/hk_buffer_view.c new file mode 100644 index 00000000000..73d32d945ae --- /dev/null +++ b/src/asahi/vulkan/hk_buffer_view.c @@ -0,0 +1,195 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ +#include "hk_buffer_view.h" +#include "asahi/lib/agx_formats.h" +#include "asahi/lib/agx_nir_lower_vbo.h" +#include "util/bitscan.h" +#include "util/format/u_format.h" +#include "util/format/u_formats.h" + +#include "agx_helpers.h" +#include "agx_nir_passes.h" +#include "agx_pack.h" +#include "hk_buffer.h" +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_physical_device.h" + +#include "vk_format.h" + +VkFormatFeatureFlags2 +hk_get_buffer_format_features(struct hk_physical_device *pdev, + VkFormat vk_format) +{ + VkFormatFeatureFlags2 features = 0; + enum pipe_format p_format = vk_format_to_pipe_format(vk_format); + + if (p_format == PIPE_FORMAT_NONE) + return 0; + + if (agx_vbo_supports_format(p_format)) + features |= VK_FORMAT_FEATURE_2_VERTEX_BUFFER_BIT; + + if (agx_pixel_format[p_format].texturable && + !util_format_is_depth_or_stencil(p_format)) { + + features |= VK_FORMAT_FEATURE_2_UNIFORM_TEXEL_BUFFER_BIT; + + /* RGB32 specially supported for uniform texel buffers only. */ + if (util_is_power_of_two_nonzero(util_format_get_blocksize(p_format))) { + features |= VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_BIT | + VK_FORMAT_FEATURE_2_STORAGE_WRITE_WITHOUT_FORMAT_BIT; + } + + if (p_format == PIPE_FORMAT_R32_UINT || p_format == PIPE_FORMAT_R32_SINT) + features |= VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_ATOMIC_BIT; + } + + return features; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateBufferView(VkDevice _device, const VkBufferViewCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkBufferView *pBufferView) +{ + VK_FROM_HANDLE(hk_device, device, _device); + VK_FROM_HANDLE(hk_buffer, buffer, pCreateInfo->buffer); + struct hk_buffer_view *view; + VkResult result; + + view = vk_buffer_view_create(&device->vk, pCreateInfo, pAllocator, + sizeof(*view)); + if (!view) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + enum pipe_format format = vk_format_to_pipe_format(view->vk.format); + const struct util_format_description *desc = util_format_description(format); + + uint8_t format_swizzle[4] = { + desc->swizzle[0], + desc->swizzle[1], + desc->swizzle[2], + desc->swizzle[3], + }; + + if (util_format_is_depth_or_stencil(format)) { + assert(!util_format_is_depth_and_stencil(format) && + "separate stencil always used"); + + /* Broadcast depth and stencil */ + format_swizzle[0] = 0; + format_swizzle[1] = 0; + format_swizzle[2] = 0; + format_swizzle[3] = 0; + } + + /* Decompose the offset into a multiple of 16-bytes (which we can include in + * the address) and an extra texel-aligned tail offset of up to 15 bytes. + * + * This lets us offset partially in the shader instead, getting + * around alignment restrictions on the base address pointer. 
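+    *
+    * For illustration: a view at offset 0x1c into a buffer of 4-byte texels
+    * gives base = buffer address + 0x10, tail_offset_B = 0xc and
+    * tail_offset_el = 3, so the first three texels are skipped in the shader.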
+ */ + uint64_t base = hk_buffer_address(buffer, 0) + (view->vk.offset & ~0xf); + uint32_t tail_offset_B = view->vk.offset & 0xf; + uint32_t tail_offset_el = tail_offset_B / util_format_get_blocksize(format); + assert(tail_offset_el * util_format_get_blocksize(format) == tail_offset_B && + "must be texel aligned"); + + struct agx_texture_packed tex; + agx_pack(&tex, TEXTURE, cfg) { + cfg.dimension = AGX_TEXTURE_DIMENSION_2D; + cfg.layout = AGX_LAYOUT_LINEAR; + cfg.channels = agx_pixel_format[format].channels; + cfg.type = agx_pixel_format[format].type; + cfg.swizzle_r = agx_channel_from_pipe(format_swizzle[0]); + cfg.swizzle_g = agx_channel_from_pipe(format_swizzle[1]); + cfg.swizzle_b = agx_channel_from_pipe(format_swizzle[2]); + cfg.swizzle_a = agx_channel_from_pipe(format_swizzle[3]); + + cfg.width = AGX_TEXTURE_BUFFER_WIDTH; + cfg.height = DIV_ROUND_UP(view->vk.elements, cfg.width); + cfg.first_level = cfg.last_level = 0; + + cfg.address = base; + cfg.buffer_size_sw = view->vk.elements; + cfg.buffer_offset_sw = tail_offset_el; + + cfg.srgb = (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB); + cfg.srgb_2_channel = cfg.srgb && util_format_colormask(desc) == 0x3; + + cfg.depth = 1; + cfg.stride = (cfg.width * util_format_get_blocksize(format)) - 16; + } + + struct agx_pbe_packed pbe; + agx_pack(&pbe, PBE, cfg) { + cfg.dimension = AGX_TEXTURE_DIMENSION_2D; + cfg.layout = AGX_LAYOUT_LINEAR; + cfg.channels = agx_pixel_format[format].channels; + cfg.type = agx_pixel_format[format].type; + cfg.srgb = util_format_is_srgb(format); + + assert(desc->nr_channels >= 1 && desc->nr_channels <= 4); + + for (unsigned i = 0; i < desc->nr_channels; ++i) { + if (desc->swizzle[i] == 0) + cfg.swizzle_r = i; + else if (desc->swizzle[i] == 1) + cfg.swizzle_g = i; + else if (desc->swizzle[i] == 2) + cfg.swizzle_b = i; + else if (desc->swizzle[i] == 3) + cfg.swizzle_a = i; + } + + cfg.buffer = base; + cfg.buffer_offset_sw = tail_offset_el; + + cfg.width = AGX_TEXTURE_BUFFER_WIDTH; + cfg.height = DIV_ROUND_UP(view->vk.elements, cfg.width); + cfg.level = 0; + cfg.stride = (cfg.width * util_format_get_blocksize(format)) - 4; + cfg.layers = 1; + cfg.levels = 1; + }; + + result = hk_descriptor_table_add(device, &device->images, &tex, sizeof(tex), + &view->tex_desc_index); + if (result != VK_SUCCESS) { + vk_buffer_view_destroy(&device->vk, pAllocator, &view->vk); + return result; + } + + result = hk_descriptor_table_add(device, &device->images, &pbe, sizeof(pbe), + &view->pbe_desc_index); + if (result != VK_SUCCESS) { + hk_descriptor_table_remove(device, &device->images, view->tex_desc_index); + vk_buffer_view_destroy(&device->vk, pAllocator, &view->vk); + return result; + } + + *pBufferView = hk_buffer_view_to_handle(view); + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +hk_DestroyBufferView(VkDevice _device, VkBufferView bufferView, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, device, _device); + VK_FROM_HANDLE(hk_buffer_view, view, bufferView); + + if (!view) + return; + + hk_descriptor_table_remove(device, &device->images, view->tex_desc_index); + hk_descriptor_table_remove(device, &device->images, view->pbe_desc_index); + + vk_buffer_view_destroy(&device->vk, pAllocator, &view->vk); +} diff --git a/src/asahi/vulkan/hk_buffer_view.h b/src/asahi/vulkan/hk_buffer_view.h new file mode 100644 index 00000000000..6b182006f1a --- /dev/null +++ b/src/asahi/vulkan/hk_buffer_view.h @@ -0,0 +1,27 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * 
Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#pragma once + +#include "hk_private.h" + +#include "vk_buffer_view.h" + +struct hk_physical_device; + +VkFormatFeatureFlags2 +hk_get_buffer_format_features(struct hk_physical_device *pdevice, + VkFormat format); + +struct hk_buffer_view { + struct vk_buffer_view vk; + + /** Index in the image descriptor table */ + uint32_t tex_desc_index, pbe_desc_index; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_buffer_view, vk.base, VkBufferView, + VK_OBJECT_TYPE_BUFFER_VIEW) diff --git a/src/asahi/vulkan/hk_cmd_buffer.c b/src/asahi/vulkan/hk_cmd_buffer.c new file mode 100644 index 00000000000..b3b362bf2b7 --- /dev/null +++ b/src/asahi/vulkan/hk_cmd_buffer.c @@ -0,0 +1,811 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_cmd_buffer.h" + +#include "agx_bo.h" +#include "agx_linker.h" +#include "agx_tilebuffer.h" +#include "agx_usc.h" +#include "hk_buffer.h" +#include "hk_cmd_pool.h" +#include "hk_descriptor_set.h" +#include "hk_descriptor_set_layout.h" +#include "hk_device.h" +#include "hk_device_memory.h" +#include "hk_entrypoints.h" +#include "hk_image_view.h" +#include "hk_physical_device.h" +#include "hk_shader.h" + +#include "pool.h" +#include "shader_enums.h" +#include "vk_pipeline_layout.h" +#include "vk_synchronization.h" + +#include "nouveau/nouveau.h" +#include "util/list.h" +#include "util/macros.h" +#include "util/u_dynarray.h" +#include "vulkan/vulkan_core.h" + +static void +hk_descriptor_state_fini(struct hk_cmd_buffer *cmd, + struct hk_descriptor_state *desc) +{ + struct hk_cmd_pool *pool = hk_cmd_buffer_pool(cmd); + + for (unsigned i = 0; i < HK_MAX_SETS; i++) { + vk_free(&pool->vk.alloc, desc->push[i]); + desc->push[i] = NULL; + } +} + +static void +hk_free_resettable_cmd_buffer(struct hk_cmd_buffer *cmd) +{ + struct hk_cmd_pool *pool = hk_cmd_buffer_pool(cmd); + + hk_descriptor_state_fini(cmd, &cmd->state.gfx.descriptors); + hk_descriptor_state_fini(cmd, &cmd->state.cs.descriptors); + + hk_cmd_pool_free_bo_list(pool, &cmd->uploader.main.bos); + hk_cmd_pool_free_usc_bo_list(pool, &cmd->uploader.usc.bos); + + list_for_each_entry_safe(struct hk_cs, it, &cmd->control_streams, node) { + list_del(&it->node); + hk_cs_destroy(it); + } + + util_dynarray_foreach(&cmd->large_bos, struct agx_bo *, bo) { + agx_bo_unreference(*bo); + } + + util_dynarray_clear(&cmd->large_bos); +} + +static void +hk_destroy_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer) +{ + struct hk_cmd_buffer *cmd = + container_of(vk_cmd_buffer, struct hk_cmd_buffer, vk); + struct hk_cmd_pool *pool = hk_cmd_buffer_pool(cmd); + + hk_free_resettable_cmd_buffer(cmd); + vk_command_buffer_finish(&cmd->vk); + vk_free(&pool->vk.alloc, cmd); +} + +static VkResult +hk_create_cmd_buffer(struct vk_command_pool *vk_pool, + VkCommandBufferLevel level, + struct vk_command_buffer **cmd_buffer_out) +{ + struct hk_cmd_pool *pool = container_of(vk_pool, struct hk_cmd_pool, vk); + struct hk_device *dev = hk_cmd_pool_device(pool); + struct hk_cmd_buffer *cmd; + VkResult result; + + cmd = vk_zalloc(&pool->vk.alloc, sizeof(*cmd), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (cmd == NULL) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + result = + vk_command_buffer_init(&pool->vk, &cmd->vk, &hk_cmd_buffer_ops, level); + if (result != VK_SUCCESS) { + vk_free(&pool->vk.alloc, cmd); + return result; + } + + 
util_dynarray_init(&cmd->large_bos, NULL); + + cmd->vk.dynamic_graphics_state.vi = &cmd->state.gfx._dynamic_vi; + cmd->vk.dynamic_graphics_state.ms.sample_locations = + &cmd->state.gfx._dynamic_sl; + + list_inithead(&cmd->uploader.main.bos); + list_inithead(&cmd->uploader.usc.bos); + list_inithead(&cmd->control_streams); + + *cmd_buffer_out = &cmd->vk; + + return VK_SUCCESS; +} + +static void +hk_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer, + UNUSED VkCommandBufferResetFlags flags) +{ + struct hk_cmd_buffer *cmd = + container_of(vk_cmd_buffer, struct hk_cmd_buffer, vk); + + vk_command_buffer_reset(&cmd->vk); + hk_free_resettable_cmd_buffer(cmd); + + cmd->uploader.main.map = NULL; + cmd->uploader.main.base = 0; + cmd->uploader.main.offset = 0; + cmd->uploader.usc.map = NULL; + cmd->uploader.usc.base = 0; + cmd->uploader.usc.offset = 0; + + cmd->current_cs.gfx = NULL; + cmd->current_cs.cs = NULL; + cmd->current_cs.post_gfx = NULL; + cmd->current_cs.pre_gfx = NULL; + + /* TODO: clear pool! */ + + memset(&cmd->state, 0, sizeof(cmd->state)); +} + +const struct vk_command_buffer_ops hk_cmd_buffer_ops = { + .create = hk_create_cmd_buffer, + .reset = hk_reset_cmd_buffer, + .destroy = hk_destroy_cmd_buffer, +}; + +static VkResult +hk_cmd_buffer_alloc_bo(struct hk_cmd_buffer *cmd, bool usc, + struct hk_cmd_bo **bo_out) +{ + VkResult result = hk_cmd_pool_alloc_bo(hk_cmd_buffer_pool(cmd), usc, bo_out); + if (result != VK_SUCCESS) + return result; + + if (usc) + list_addtail(&(*bo_out)->link, &cmd->uploader.usc.bos); + else + list_addtail(&(*bo_out)->link, &cmd->uploader.main.bos); + + return VK_SUCCESS; +} + +struct agx_ptr +hk_pool_alloc_internal(struct hk_cmd_buffer *cmd, uint32_t size, + uint32_t alignment, bool usc) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct hk_uploader *uploader = + usc ? &cmd->uploader.usc : &cmd->uploader.main; + + /* Specially handle large allocations owned by the command buffer, e.g. used + * for statically allocated vertex output buffers with geometry shaders. + */ + if (size > HK_CMD_BO_SIZE) { + uint32_t flags = usc ? AGX_BO_LOW_VA : 0; + struct agx_bo *bo = + agx_bo_create(&dev->dev, size, flags, "Large pool allocation"); + + util_dynarray_append(&cmd->large_bos, struct agx_bo *, bo); + return bo->ptr; + } + + assert(size <= HK_CMD_BO_SIZE); + assert(alignment > 0); + + uint32_t offset = align(uploader->offset, alignment); + + assert(offset <= HK_CMD_BO_SIZE); + if (uploader->map != NULL && size <= HK_CMD_BO_SIZE - offset) { + uploader->offset = offset + size; + + return (struct agx_ptr){ + .gpu = uploader->base + offset, + .cpu = uploader->map + offset, + }; + } + + struct hk_cmd_bo *bo; + VkResult result = hk_cmd_buffer_alloc_bo(cmd, usc, &bo); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(&cmd->vk, result); + return (struct agx_ptr){0}; + } + + /* Pick whichever of the current upload BO and the new BO will have more + * room left to be the BO for the next upload. If our upload size is + * bigger than the old offset, we're better off burning the whole new + * upload BO on this one allocation and continuing on the current upload + * BO. 
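+    *
+    * For illustration, assuming a 64 KiB pool BO: with 60 KiB already used,
+    * an 8 KiB request switches to the new BO (8 KiB < 60 KiB, leaving 56 KiB
+    * free there versus 4 KiB here), while with only 4 KiB used, a 62 KiB
+    * request burns the new BO on this allocation and keeps the current BO
+    * active for future uploads.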
+ */ + if (uploader->map == NULL || size < uploader->offset) { + uploader->map = bo->bo->ptr.cpu; + uploader->base = bo->bo->ptr.gpu; + uploader->offset = size; + } + + return (struct agx_ptr){ + .gpu = bo->bo->ptr.gpu, + .cpu = bo->map, + }; +} + +uint64_t +hk_pool_upload(struct hk_cmd_buffer *cmd, const void *data, uint32_t size, + uint32_t alignment) +{ + struct agx_ptr T = hk_pool_alloc(cmd, size, alignment); + if (unlikely(T.cpu == NULL)) + return 0; + + memcpy(T.cpu, data, size); + return T.gpu; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_BeginCommandBuffer(VkCommandBuffer commandBuffer, + const VkCommandBufferBeginInfo *pBeginInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + + hk_reset_cmd_buffer(&cmd->vk, 0); + + hk_cmd_buffer_begin_compute(cmd, pBeginInfo); + hk_cmd_buffer_begin_graphics(cmd, pBeginInfo); + + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_EndCommandBuffer(VkCommandBuffer commandBuffer) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + + assert(cmd->current_cs.gfx == NULL && cmd->current_cs.pre_gfx == NULL && + "must end rendering before ending the command buffer"); + + hk_cmd_buffer_end_compute(cmd); + hk_cmd_buffer_end_compute_internal(&cmd->current_cs.post_gfx); + + return vk_command_buffer_get_record_result(&cmd->vk); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdPipelineBarrier2(VkCommandBuffer commandBuffer, + const VkDependencyInfo *pDependencyInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + + /* The big hammer. We end both compute and graphics batches. Ending compute + * here is necessary to properly handle graphics->compute dependencies. + * + * XXX: perf. */ + hk_cmd_buffer_end_compute(cmd); + hk_cmd_buffer_end_graphics(cmd); +} + +void +hk_cmd_bind_shaders(struct vk_command_buffer *vk_cmd, uint32_t stage_count, + const gl_shader_stage *stages, + struct vk_shader **const shaders) +{ + struct hk_cmd_buffer *cmd = container_of(vk_cmd, struct hk_cmd_buffer, vk); + + for (uint32_t i = 0; i < stage_count; i++) { + struct hk_api_shader *shader = + container_of(shaders[i], struct hk_api_shader, vk); + + if (stages[i] == MESA_SHADER_COMPUTE || stages[i] == MESA_SHADER_KERNEL) + hk_cmd_bind_compute_shader(cmd, shader); + else + hk_cmd_bind_graphics_shader(cmd, stages[i], shader); + } +} + +static void +hk_bind_descriptor_sets(UNUSED struct hk_cmd_buffer *cmd, + struct hk_descriptor_state *desc, + const VkBindDescriptorSetsInfoKHR *info) +{ + VK_FROM_HANDLE(vk_pipeline_layout, pipeline_layout, info->layout); + + /* Fro the Vulkan 1.3.275 spec: + * + * "When binding a descriptor set (see Descriptor Set Binding) to + * set number N... + * + * If, additionally, the previously bound descriptor set for set + * N was bound using a pipeline layout not compatible for set N, + * then all bindings in sets numbered greater than N are + * disturbed." + * + * This means that, if some earlier set gets bound in such a way that + * it changes set_dynamic_buffer_start[s], this binding is implicitly + * invalidated. Therefore, we can always look at the current value + * of set_dynamic_buffer_start[s] as the base of our dynamic buffer + * range and it's only our responsibility to adjust all + * set_dynamic_buffer_start[p] for p > s as needed. 
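+    *
+    * For illustration: if set 0 declares two dynamic buffers and set 1
+    * declares three, set_dynamic_buffer_start is {0, 2, 5, 5, ...}. Rebinding
+    * set 1 with a layout declaring only one dynamic buffer leaves entry 1 at
+    * 2 but shifts every later entry down to 3.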
+ */ + uint8_t dyn_buffer_start = + desc->root.set_dynamic_buffer_start[info->firstSet]; + + uint32_t next_dyn_offset = 0; + for (uint32_t i = 0; i < info->descriptorSetCount; ++i) { + unsigned s = i + info->firstSet; + VK_FROM_HANDLE(hk_descriptor_set, set, info->pDescriptorSets[i]); + + if (desc->sets[s] != set) { + if (set != NULL) { + desc->root.sets[s] = hk_descriptor_set_addr(set); + desc->set_sizes[s] = set->size; + } else { + desc->root.sets[s] = 0; + desc->set_sizes[s] = 0; + } + desc->sets[s] = set; + desc->sets_dirty |= BITFIELD_BIT(s); + + /* Binding descriptors invalidates push descriptors */ + desc->push_dirty &= ~BITFIELD_BIT(s); + } + + desc->root.set_dynamic_buffer_start[s] = dyn_buffer_start; + + if (pipeline_layout->set_layouts[s] != NULL) { + const struct hk_descriptor_set_layout *set_layout = + vk_to_hk_descriptor_set_layout(pipeline_layout->set_layouts[s]); + + if (set != NULL && set_layout->dynamic_buffer_count > 0) { + for (uint32_t j = 0; j < set_layout->dynamic_buffer_count; j++) { + struct hk_buffer_address addr = set->dynamic_buffers[j]; + addr.base_addr += info->pDynamicOffsets[next_dyn_offset + j]; + desc->root.dynamic_buffers[dyn_buffer_start + j] = addr; + } + next_dyn_offset += set->layout->dynamic_buffer_count; + } + + dyn_buffer_start += set_layout->dynamic_buffer_count; + } else { + assert(set == NULL); + } + } + assert(dyn_buffer_start <= HK_MAX_DYNAMIC_BUFFERS); + assert(next_dyn_offset <= info->dynamicOffsetCount); + + for (uint32_t s = info->firstSet + info->descriptorSetCount; s < HK_MAX_SETS; + s++) + desc->root.set_dynamic_buffer_start[s] = dyn_buffer_start; + + desc->root_dirty = true; +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdBindDescriptorSets2KHR( + VkCommandBuffer commandBuffer, + const VkBindDescriptorSetsInfoKHR *pBindDescriptorSetsInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + + if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) { + hk_bind_descriptor_sets(cmd, &cmd->state.gfx.descriptors, + pBindDescriptorSetsInfo); + } + + if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) { + hk_bind_descriptor_sets(cmd, &cmd->state.cs.descriptors, + pBindDescriptorSetsInfo); + } +} + +static void +hk_push_constants(UNUSED struct hk_cmd_buffer *cmd, + struct hk_descriptor_state *desc, + const VkPushConstantsInfoKHR *info) +{ + memcpy(desc->root.push + info->offset, info->pValues, info->size); + desc->root_dirty = true; +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdPushConstants2KHR(VkCommandBuffer commandBuffer, + const VkPushConstantsInfoKHR *pPushConstantsInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + + if (pPushConstantsInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) + hk_push_constants(cmd, &cmd->state.gfx.descriptors, pPushConstantsInfo); + + if (pPushConstantsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) + hk_push_constants(cmd, &cmd->state.cs.descriptors, pPushConstantsInfo); +} + +static struct hk_push_descriptor_set * +hk_cmd_push_descriptors(struct hk_cmd_buffer *cmd, + struct hk_descriptor_state *desc, uint32_t set) +{ + assert(set < HK_MAX_SETS); + if (unlikely(desc->push[set] == NULL)) { + desc->push[set] = + vk_zalloc(&cmd->vk.pool->alloc, sizeof(*desc->push[set]), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (unlikely(desc->push[set] == NULL)) { + vk_command_buffer_set_error(&cmd->vk, VK_ERROR_OUT_OF_HOST_MEMORY); + return NULL; + } + } + + /* Pushing descriptors replaces whatever sets are bound */ + desc->sets[set] = NULL; + desc->push_dirty |= BITFIELD_BIT(set); + + 
return desc->push[set]; +} + +static void +hk_push_descriptor_set(struct hk_cmd_buffer *cmd, + struct hk_descriptor_state *desc, + const VkPushDescriptorSetInfoKHR *info) +{ + VK_FROM_HANDLE(vk_pipeline_layout, pipeline_layout, info->layout); + + struct hk_push_descriptor_set *push_set = + hk_cmd_push_descriptors(cmd, desc, info->set); + if (unlikely(push_set == NULL)) + return; + + struct hk_descriptor_set_layout *set_layout = + vk_to_hk_descriptor_set_layout(pipeline_layout->set_layouts[info->set]); + + hk_push_descriptor_set_update(push_set, set_layout, + info->descriptorWriteCount, + info->pDescriptorWrites); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdPushDescriptorSet2KHR( + VkCommandBuffer commandBuffer, + const VkPushDescriptorSetInfoKHR *pPushDescriptorSetInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + + if (pPushDescriptorSetInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) { + hk_push_descriptor_set(cmd, &cmd->state.gfx.descriptors, + pPushDescriptorSetInfo); + } + + if (pPushDescriptorSetInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) { + hk_push_descriptor_set(cmd, &cmd->state.cs.descriptors, + pPushDescriptorSetInfo); + } +} + +void +hk_cmd_buffer_flush_push_descriptors(struct hk_cmd_buffer *cmd, + struct hk_descriptor_state *desc) +{ + u_foreach_bit(set_idx, desc->push_dirty) { + struct hk_push_descriptor_set *push_set = desc->push[set_idx]; + uint64_t push_set_addr = hk_pool_upload( + cmd, push_set->data, sizeof(push_set->data), HK_MIN_UBO_ALIGNMENT); + + desc->root.sets[set_idx] = push_set_addr; + desc->set_sizes[set_idx] = sizeof(push_set->data); + } + + desc->root_dirty = true; + desc->push_dirty = 0; +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdPushDescriptorSetWithTemplate2KHR( + VkCommandBuffer commandBuffer, const VkPushDescriptorSetWithTemplateInfoKHR + *pPushDescriptorSetWithTemplateInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(vk_descriptor_update_template, template, + pPushDescriptorSetWithTemplateInfo->descriptorUpdateTemplate); + VK_FROM_HANDLE(vk_pipeline_layout, pipeline_layout, + pPushDescriptorSetWithTemplateInfo->layout); + + struct hk_descriptor_state *desc = + hk_get_descriptors_state(cmd, template->bind_point); + struct hk_push_descriptor_set *push_set = hk_cmd_push_descriptors( + cmd, desc, pPushDescriptorSetWithTemplateInfo->set); + if (unlikely(push_set == NULL)) + return; + + struct hk_descriptor_set_layout *set_layout = vk_to_hk_descriptor_set_layout( + pipeline_layout->set_layouts[pPushDescriptorSetWithTemplateInfo->set]); + + hk_push_descriptor_set_update_template( + push_set, set_layout, template, + pPushDescriptorSetWithTemplateInfo->pData); +} + +uint64_t +hk_cmd_buffer_upload_root(struct hk_cmd_buffer *cmd, + VkPipelineBindPoint bind_point) +{ + struct hk_descriptor_state *desc = hk_get_descriptors_state(cmd, bind_point); + struct hk_root_descriptor_table *root = &desc->root; + + struct agx_ptr root_ptr = hk_pool_alloc(cmd, sizeof(*root), 8); + if (!root_ptr.cpu) + return 0; + + root->root_desc_addr = root_ptr.gpu; + + memcpy(root_ptr.cpu, root, sizeof(*root)); + return root_ptr.gpu; +} + +void +hk_usc_upload_spilled_rt_descs(struct agx_usc_builder *b, + struct hk_cmd_buffer *cmd) +{ + struct hk_rendering_state *render = &cmd->state.gfx.render; + + /* Upload texture/PBE descriptors for each render target so we can clear + * spilled render targets. 
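+    *
+    * The table holds two descriptors per render target: slot 2*i is the
+    * texture view and slot 2*i+1 is the PBE (store) view. Its base address is
+    * bound at u0_u1 below for bindless access.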
+ */ + struct agx_ptr descs = + hk_pool_alloc(cmd, AGX_TEXTURE_LENGTH * 2 * render->color_att_count, 64); + struct agx_texture_packed *desc = descs.cpu; + if (!desc) + return; + + for (unsigned i = 0; i < render->color_att_count; ++i) { + struct hk_image_view *iview = render->color_att[i].iview; + if (!iview) { + /* XXX: probably should emit a null descriptor here...? */ + continue; + } + + memcpy(&desc[(i * 2) + 0], &iview->planes[0].emrt_texture, sizeof(*desc)); + memcpy(&desc[(i * 2) + 1], &iview->planes[0].emrt_pbe, sizeof(*desc)); + } + + desc = descs.cpu; + + /* Bind the base as u0_u1 for bindless access */ + agx_usc_uniform(b, 0, 4, hk_pool_upload(cmd, &descs.gpu, 8, 8)); +} + +void +hk_reserve_scratch(struct hk_cmd_buffer *cmd, struct hk_cs *cs, + struct hk_shader *s) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + uint32_t max_scratch_size = + MAX2(s->b.info.scratch_size, s->b.info.preamble_scratch_size); + + if (max_scratch_size == 0) + return; + + unsigned preamble_size = (s->b.info.preamble_scratch_size > 0) ? 1 : 0; + + /* XXX: need to lock around agx_scratch_alloc... */ + /* Note: this uses the hardware stage, not the software stage */ + switch (s->b.info.stage) { + case PIPE_SHADER_FRAGMENT: + agx_scratch_alloc(&dev->scratch.fs, max_scratch_size, 0); + cs->scratch.fs.main = true; + cs->scratch.fs.preamble = MAX2(cs->scratch.fs.preamble, preamble_size); + break; + case PIPE_SHADER_VERTEX: + agx_scratch_alloc(&dev->scratch.vs, max_scratch_size, 0); + cs->scratch.vs.main = true; + cs->scratch.vs.preamble = MAX2(cs->scratch.vs.preamble, preamble_size); + break; + default: + agx_scratch_alloc(&dev->scratch.cs, max_scratch_size, 0); + cs->scratch.cs.main = true; + cs->scratch.cs.preamble = MAX2(cs->scratch.cs.preamble, preamble_size); + break; + } +} + +uint32_t +hk_upload_usc_words(struct hk_cmd_buffer *cmd, struct hk_shader *s, + struct hk_linked_shader *linked) +{ + enum pipe_shader_type sw_stage = s->info.stage; + enum pipe_shader_type hw_stage = s->b.info.stage; + + unsigned constant_push_ranges = + DIV_ROUND_UP(s->b.info.immediate_size_16, 64); + unsigned push_ranges = 2; + unsigned stage_ranges = 3; + + size_t usc_size = + agx_usc_size(constant_push_ranges + push_ranges + stage_ranges + 4); + struct agx_ptr t = hk_pool_usc_alloc(cmd, usc_size, 64); + if (!t.cpu) + return 0; + + struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size); + + uint64_t root_ptr; + + if (sw_stage == PIPE_SHADER_COMPUTE) + root_ptr = hk_cmd_buffer_upload_root(cmd, VK_PIPELINE_BIND_POINT_COMPUTE); + else + root_ptr = cmd->state.gfx.root; + + static_assert(offsetof(struct hk_root_descriptor_table, root_desc_addr) == 0, + "self-reflective"); + + agx_usc_uniform(&b, HK_ROOT_UNIFORM, 4, root_ptr); + + if (sw_stage == MESA_SHADER_VERTEX) { + unsigned count = + DIV_ROUND_UP(BITSET_LAST_BIT(s->info.vs.attrib_components_read), 4); + + if (count) { + agx_usc_uniform( + &b, 0, 4 * count, + root_ptr + hk_root_descriptor_offset(draw.attrib_base)); + + agx_usc_uniform( + &b, 4 * count, 2 * count, + root_ptr + hk_root_descriptor_offset(draw.attrib_clamps)); + } + + if (cmd->state.gfx.draw_params) + agx_usc_uniform(&b, 6 * count, 4, cmd->state.gfx.draw_params); + + if (cmd->state.gfx.draw_id_ptr) + agx_usc_uniform(&b, (6 * count) + 4, 1, cmd->state.gfx.draw_id_ptr); + + if (hw_stage == MESA_SHADER_COMPUTE) { + agx_usc_uniform( + &b, (6 * count) + 8, 4, + root_ptr + hk_root_descriptor_offset(draw.input_assembly)); + } + } else if (sw_stage == MESA_SHADER_FRAGMENT) { + if 
(agx_tilebuffer_spills(&cmd->state.gfx.render.tilebuffer)) { + hk_usc_upload_spilled_rt_descs(&b, cmd); + } + + agx_usc_uniform( + &b, 4, 8, root_ptr + hk_root_descriptor_offset(draw.blend_constant)); + + /* The SHARED state is baked into linked->usc for non-fragment shaders. We + * don't pass around the information to bake the tilebuffer layout. + * + * TODO: We probably could with some refactor. + */ + agx_usc_push_packed(&b, SHARED, &cmd->state.gfx.render.tilebuffer.usc); + } + + agx_usc_push_blob(&b, linked->usc.data, linked->usc.size); + return t.gpu; +} + +/* Specialized variant of hk_upload_usc_words for internal dispatches that do + * not use any state except for some directly mapped uniforms. + */ +uint32_t +hk_upload_usc_words_kernel(struct hk_cmd_buffer *cmd, struct hk_shader *s, + void *data, size_t data_size) +{ + assert(s->info.stage == MESA_SHADER_COMPUTE); + assert(s->b.info.scratch_size == 0 && "you shouldn't be spilling!"); + assert(s->b.info.preamble_scratch_size == 0 && "you shouldn't be spilling!"); + + unsigned constant_push_ranges = + DIV_ROUND_UP(s->b.info.immediate_size_16, 64); + + size_t usc_size = agx_usc_size(constant_push_ranges + 7); + struct agx_ptr t = hk_pool_usc_alloc(cmd, usc_size, 64); + if (!t.cpu) + return 0; + + struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size); + + /* Map the data directly as uniforms starting at u0 */ + agx_usc_uniform(&b, 0, DIV_ROUND_UP(data_size, 2), + hk_pool_upload(cmd, data, data_size, 4)); + + agx_usc_push_blob(&b, s->only_linked->usc.data, s->only_linked->usc.size); + return t.gpu; +} + +void +hk_cs_init_graphics(struct hk_cmd_buffer *cmd, struct hk_cs *cs) +{ + struct hk_rendering_state *render = &cmd->state.gfx.render; + uint8_t *map = cs->current; + + cs->tib = render->tilebuffer; + + /* Assume this is not the first control stream of the render pass, so + * initially use the partial background program and ZLS control. + * hk_BeginRendering will override. + */ + cs->cr = render->cr; + cs->cr.bg.main = render->cr.bg.partial; + cs->cr.zls_control = render->cr.zls_control_partial; + + /* Barrier to enforce GPU-CPU coherency, in case this batch is back to back + * with another that caused stale data to be cached and the CPU wrote to it + * in the meantime. + */ + agx_push(map, VDM_BARRIER, cfg) { + cfg.usc_cache_inval = true; + } + + struct AGX_PPP_HEADER present = { + .w_clamp = true, + .occlusion_query_2 = true, + .output_unknown = true, + .varying_word_2 = true, + .viewport_count = 1, /* irrelevant */ + }; + + size_t size = agx_ppp_update_size(&present); + struct agx_ptr T = hk_pool_alloc(cmd, size, 64); + if (!T.cpu) + return; + + struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &present); + + /* clang-format off */ + agx_ppp_push(&ppp, W_CLAMP, cfg) cfg.w_clamp = 1e-10; + agx_ppp_push(&ppp, FRAGMENT_OCCLUSION_QUERY_2, cfg); + agx_ppp_push(&ppp, OUTPUT_UNKNOWN, cfg); + agx_ppp_push(&ppp, VARYING_2, cfg); + /* clang-format on */ + + agx_ppp_fini(&map, &ppp); + cs->current = map; + + util_dynarray_init(&cs->scissor, NULL); + util_dynarray_init(&cs->depth_bias, NULL); + + /* All graphics state must be reemited in each control stream */ + hk_cmd_buffer_dirty_all(cmd); +} + +void +hk_ensure_cs_has_space(struct hk_cmd_buffer *cmd, struct hk_cs *cs, + size_t space) +{ + bool vdm = cs->type == HK_CS_VDM; + + size_t link_length = + vdm ? 
AGX_VDM_STREAM_LINK_LENGTH : AGX_CDM_STREAM_LINK_LENGTH; + + /* Assert that we have space for a link tag */ + assert((cs->current + link_length) <= cs->end && "Encoder overflowed"); + + /* Always leave room for a link tag, in case we run out of space later, + * plus padding because VDM apparently overreads? + * + * 0x200 is not enough. 0x400 seems to work. 0x800 for safety. + */ + space += link_length + 0x800; + + /* If there is room in the command buffer, we're done */ + if (likely((cs->end - cs->current) >= space)) + return; + + /* Otherwise, we need to allocate a new command buffer. We use memory owned + * by the batch to simplify lifetime management for the BO. + */ + size_t size = 65536; + struct agx_ptr T = hk_pool_alloc(cmd, size, 256); + + /* Jump from the old control stream to the new control stream */ + if (vdm) { + agx_pack(cs->current, VDM_STREAM_LINK, cfg) { + cfg.target_lo = T.gpu & BITFIELD_MASK(32); + cfg.target_hi = T.gpu >> 32; + } + } else { + agx_pack(cs->current, CDM_STREAM_LINK, cfg) { + cfg.target_lo = T.gpu & BITFIELD_MASK(32); + cfg.target_hi = T.gpu >> 32; + } + } + + /* Swap out the control stream */ + cs->current = T.cpu; + cs->end = cs->current + size; + cs->stream_linked = true; +} diff --git a/src/asahi/vulkan/hk_cmd_buffer.h b/src/asahi/vulkan/hk_cmd_buffer.h new file mode 100644 index 00000000000..0b93f0a924f --- /dev/null +++ b/src/asahi/vulkan/hk_cmd_buffer.h @@ -0,0 +1,767 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "util/macros.h" + +#include "util/list.h" +#include "agx_helpers.h" +#include "agx_linker.h" +#include "agx_pack.h" +#include "agx_tilebuffer.h" +#include "agx_uvs.h" +#include "pool.h" +#include "shader_enums.h" + +#include "hk_private.h" +#include "hk_shader.h" + +#include "hk_cmd_pool.h" +#include "hk_descriptor_set.h" + +#include "asahi/lib/agx_nir_lower_vbo.h" +#include "util/u_dynarray.h" +#include "vulkan/vulkan_core.h" + +#include "vk_command_buffer.h" + +#include + +struct hk_buffer; +struct hk_cmd_bo; +struct hk_cmd_pool; +struct hk_image_view; +struct hk_push_descriptor_set; +struct hk_shader; +struct hk_linked_shader; +struct agx_usc_builder; +struct vk_shader; + +/** Root descriptor table. */ +struct hk_root_descriptor_table { + uint64_t root_desc_addr; + + union { + struct { + uint32_t view_index; + uint32_t ppp_multisamplectl; + + /* Vertex input state */ + uint64_t attrib_base[AGX_MAX_VBUFS]; + uint32_t attrib_clamps[AGX_MAX_VBUFS]; + + /* Pointer to the VS->TCS, VS->GS, or TES->GS buffer. */ + uint64_t vertex_output_buffer; + + /* Mask of outputs flowing VS->TCS, VS->GS, or TES->GS . */ + uint64_t vertex_outputs; + + /* Address of input assembly buffer if geom/tess is used, else 0 */ + uint64_t input_assembly; + + /* Address of tessellation param buffer if tessellation used, else 0 */ + uint64_t tess_params; + + /* Address of geometry param buffer if GS is used, else 0 */ + uint64_t geometry_params; + + /* Pipeline statistics queries. This is a base address with flags. */ + uint64_t pipeline_stats; + VkQueryPipelineStatisticFlags pipeline_stats_flags; + + float blend_constant[4]; + uint16_t no_epilog_discard; + uint16_t _pad1; + uint16_t api_sample_mask; + uint16_t _pad2; + uint16_t force_never_in_shader; + uint16_t _pad3; + uint16_t provoking; + uint16_t _pad4; + + /* Mapping from varying slots written by the last vertex stage to UVS + * indices. 
This mapping must be compatible with the fragment shader. + */ + uint8_t uvs_index[VARYING_SLOT_MAX]; + } draw; + struct { + uint64_t group_count_addr; + uint32_t base_group[3]; + } cs; + }; + + /* Client push constants */ + uint8_t push[HK_MAX_PUSH_SIZE]; + + /* Descriptor set base addresses */ + uint64_t sets[HK_MAX_SETS]; + + /* Dynamic buffer bindings */ + struct hk_buffer_address dynamic_buffers[HK_MAX_DYNAMIC_BUFFERS]; + + /* Start index in dynamic_buffers where each set starts */ + uint8_t set_dynamic_buffer_start[HK_MAX_SETS]; +}; + +/* helper macro for computing root descriptor byte offsets */ +#define hk_root_descriptor_offset(member) \ + offsetof(struct hk_root_descriptor_table, member) + +struct hk_descriptor_state { + bool root_dirty; + struct hk_root_descriptor_table root; + + uint32_t set_sizes[HK_MAX_SETS]; + struct hk_descriptor_set *sets[HK_MAX_SETS]; + uint32_t sets_dirty; + + struct hk_push_descriptor_set *push[HK_MAX_SETS]; + uint32_t push_dirty; +}; + +struct hk_attachment { + VkFormat vk_format; + struct hk_image_view *iview; + + VkResolveModeFlagBits resolve_mode; + struct hk_image_view *resolve_iview; +}; + +struct hk_bg_eot { + uint64_t usc; + struct agx_counts_packed counts; +}; + +struct hk_render_registers { + uint32_t width, height, layers; + uint32_t isp_bgobjdepth; + uint32_t isp_bgobjvals; + struct agx_zls_control_packed zls_control, zls_control_partial; + uint32_t iogpu_unk_214; + uint32_t depth_dimensions; + + struct { + uint32_t dimensions; + uint64_t buffer, meta; + uint32_t stride, meta_stride; + } depth; + + struct { + uint64_t buffer, meta; + uint32_t stride, meta_stride; + } stencil; + + struct { + struct hk_bg_eot main; + struct hk_bg_eot partial; + } bg; + + struct { + struct hk_bg_eot main; + struct hk_bg_eot partial; + } eot; +}; + +struct hk_rendering_state { + VkRenderingFlagBits flags; + + VkRect2D area; + uint32_t layer_count; + uint32_t view_mask; + + uint32_t color_att_count; + struct hk_attachment color_att[HK_MAX_RTS]; + struct hk_attachment depth_att; + struct hk_attachment stencil_att; + + struct agx_tilebuffer_layout tilebuffer; + struct hk_render_registers cr; +}; + +struct hk_index_buffer_state { + struct hk_addr_range buffer; + enum agx_index_size size; + uint32_t restart; +}; + +/* Dirty tracking bits for state not tracked by vk_dynamic_graphics_state or + * shaders_dirty. + */ +enum hk_dirty { + HK_DIRTY_INDEX = BITFIELD_BIT(0), + HK_DIRTY_VB = BITFIELD_BIT(1), + HK_DIRTY_OCCLUSION = BITFIELD_BIT(2), + HK_DIRTY_PROVOKING = BITFIELD_BIT(3), + HK_DIRTY_VARYINGS = BITFIELD_BIT(4), +}; + +struct hk_graphics_state { + struct hk_rendering_state render; + struct hk_descriptor_state descriptors; + + enum hk_dirty dirty; + + uint64_t root; + uint64_t draw_params; + uint64_t draw_id_ptr; + + uint32_t shaders_dirty; + struct hk_api_shader *shaders[MESA_SHADER_MESH + 1]; + + /* Vertex buffers */ + struct hk_addr_range vb[AGX_MAX_VBUFS]; + + /* Transform feedback buffers */ + struct hk_addr_range xfb[4]; + + /* Is transform feedback enabled? */ + bool xfb_enabled; + + /* Internal transform feedback offset vec4. + * + * TODO: Strictly could be global. + */ + uint64_t xfb_offsets; + + /* Pointer to the GPU memory backing active transform feedback queries, + * per-stream. Zero if no query is bound. + */ + uint64_t xfb_query[4]; + + struct hk_index_buffer_state index; + enum agx_primitive topology; + enum agx_object_type object_type; + + /* Provoking vertex 0, 1, or 2. Usually 0 or 2 for FIRST/LAST. 1 can only be + * set for tri fans. 
+ */ + uint8_t provoking; + + struct { + enum agx_visibility_mode mode; + + /* If enabled, index of the current occlusion query in the occlusion heap. + * There can only be one active at a time (hardware contraint). + */ + uint16_t index; + } occlusion; + + /* Fast linked shader data structures */ + uint64_t varyings; + struct agx_varyings_vs linked_varyings; + + uint32_t linked_dirty; + struct hk_linked_shader *linked[PIPE_SHADER_TYPES]; + bool generate_primitive_id; + + /* Tessellation state */ + uint64_t tess_out_draws; + + /* Needed by vk_command_buffer::dynamic_graphics_state */ + struct vk_vertex_input_state _dynamic_vi; + struct vk_sample_locations_state _dynamic_sl; +}; + +struct hk_compute_state { + struct hk_descriptor_state descriptors; + struct hk_api_shader *shader; +}; + +struct hk_cmd_push { + void *map; + uint64_t addr; + uint32_t range; + bool no_prefetch; +}; + +struct hk_scratch_req { + bool main; + bool preamble; +}; + +/* + * hk_cs represents a single control stream, to be enqueued either to the + * CDM or VDM for compute/3D respectively. + */ +enum hk_cs_type { + HK_CS_CDM, + HK_CS_VDM, +}; + +struct hk_cs { + struct list_head node; + + /* Data master */ + enum hk_cs_type type; + + /* Address of the root control stream for the job */ + uint64_t addr; + + /* Start pointer of the root control stream */ + void *start; + + /* Current pointer within the control stream */ + void *current; + + /* End pointer of the current chunk of the control stream */ + void *end; + + /* Whether there is more than just the root chunk */ + bool stream_linked; + + /* Scratch requirements */ + struct { + union { + struct hk_scratch_req vs; + struct hk_scratch_req cs; + }; + + struct hk_scratch_req fs; + } scratch; + + /* Remaining state is for graphics only, ignored for compute */ + struct agx_tilebuffer_layout tib; + + struct util_dynarray scissor, depth_bias; + uint64_t uploaded_scissor, uploaded_zbias; + + /* We can only set ppp_multisamplectl once per batch. has_sample_locations + * tracks if we've committed to a set of sample locations yet. vk_meta + * operations do not set has_sample_locations since they don't care and it + * would interfere with the app-provided samples. + * + */ + bool has_sample_locations; + uint32_t ppp_multisamplectl; + + struct hk_render_registers cr; +}; + +struct hk_uploader { + /** List of hk_cmd_bo */ + struct list_head bos; + + /* Current addresses */ + uint8_t *map; + uint64_t base; + uint32_t offset; +}; + +struct hk_cmd_buffer { + struct vk_command_buffer vk; + + struct { + struct hk_graphics_state gfx; + struct hk_compute_state cs; + } state; + + struct { + struct hk_uploader main, usc; + } uploader; + + /* List of all recorded control streams */ + struct list_head control_streams; + + /* Current recorded control stream */ + struct { + /* VDM stream for 3D */ + struct hk_cs *gfx; + + /* CDM stream for compute */ + struct hk_cs *cs; + + /* CDM stream that executes immediately before the current graphics + * control stream. Used for geometry shading, tessellation, etc. + */ + struct hk_cs *pre_gfx; + + /* CDM stream that will execute after the current graphics control stream + * finishes. Used for queries. + */ + struct hk_cs *post_gfx; + } current_cs; + + /* Are we currently inside a vk_meta operation? This alters sample location + * behaviour. + */ + bool in_meta; + + /* XXX: move me? + * + * Indirect draw generated by the pre-GS for the geometry shader. + */ + uint64_t geom_indirect; + + /* Does the command buffer use the geometry heap? 
*/ + bool uses_heap; + + /* Owned large BOs */ + struct util_dynarray large_bos; +}; + +VK_DEFINE_HANDLE_CASTS(hk_cmd_buffer, vk.base, VkCommandBuffer, + VK_OBJECT_TYPE_COMMAND_BUFFER) + +extern const struct vk_command_buffer_ops hk_cmd_buffer_ops; + +static inline struct hk_device * +hk_cmd_buffer_device(struct hk_cmd_buffer *cmd) +{ + return (struct hk_device *)cmd->vk.base.device; +} + +static inline struct hk_cmd_pool * +hk_cmd_buffer_pool(struct hk_cmd_buffer *cmd) +{ + return (struct hk_cmd_pool *)cmd->vk.pool; +} + +/* + * The hardware vertex shader is supplied by the last geometry stage. The + * geometry pipeline is vertex->tess->geometry so we search backwards. + */ +static inline struct hk_shader * +hk_bound_hw_vs(struct hk_graphics_state *gfx) +{ + struct hk_api_shader *vs = gfx->shaders[MESA_SHADER_VERTEX]; + struct hk_api_shader *tes = gfx->shaders[MESA_SHADER_TESS_EVAL]; + struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY]; + + if (gs) + return &gs->variants[HK_GS_VARIANT_RAST]; + else if (tes) + return &tes->variants[HK_VS_VARIANT_HW]; + else + return &vs->variants[HK_VS_VARIANT_HW]; +} + +static inline struct hk_shader * +hk_bound_sw_vs(struct hk_graphics_state *gfx) +{ + struct hk_api_shader *vs = gfx->shaders[MESA_SHADER_VERTEX]; + struct hk_shader *hw_vs = hk_bound_hw_vs(gfx); + + if (hw_vs == &vs->variants[HK_VS_VARIANT_HW]) + return hw_vs; + else + return &vs->variants[HK_VS_VARIANT_SW]; +} + +static inline struct hk_shader * +hk_bound_sw_vs_before_gs(struct hk_graphics_state *gfx) +{ + struct hk_api_shader *vs = gfx->shaders[MESA_SHADER_VERTEX]; + struct hk_api_shader *tes = gfx->shaders[MESA_SHADER_TESS_EVAL]; + struct hk_api_shader *api = tes ?: vs; + + return &api->variants[HK_VS_VARIANT_SW]; +} + +struct agx_ptr hk_pool_alloc_internal(struct hk_cmd_buffer *cmd, uint32_t size, + uint32_t alignment, bool usc); + +uint64_t hk_pool_upload(struct hk_cmd_buffer *cmd, const void *data, + uint32_t size, uint32_t alignment); + +static inline struct agx_ptr +hk_pool_alloc(struct hk_cmd_buffer *cmd, uint32_t size, uint32_t alignment) +{ + return hk_pool_alloc_internal(cmd, size, alignment, false); +} + +static inline struct agx_ptr +hk_pool_usc_alloc(struct hk_cmd_buffer *cmd, uint32_t size, uint32_t alignment) +{ + return hk_pool_alloc_internal(cmd, size, alignment, true); +} + +void hk_cs_init_graphics(struct hk_cmd_buffer *cmd, struct hk_cs *cs); +uint32_t hk_default_sample_positions(unsigned nr_samples); + +static inline struct hk_cs * +hk_cmd_buffer_get_cs_general(struct hk_cmd_buffer *cmd, struct hk_cs **ptr, + bool compute) +{ + if ((*ptr) == NULL) { + /* Allocate root control stream */ + size_t initial_size = 65536; + struct agx_ptr root = hk_pool_alloc(cmd, initial_size, 1024); + if (!root.cpu) + return NULL; + + /* Allocate hk_cs for the new stream */ + struct hk_cs *cs = malloc(sizeof(*cs)); + *cs = (struct hk_cs){ + .type = compute ? HK_CS_CDM : HK_CS_VDM, + .addr = root.gpu, + .start = root.cpu, + .current = root.cpu, + .end = root.cpu + initial_size, + }; + + list_inithead(&cs->node); + + bool before_gfx = (ptr == &cmd->current_cs.pre_gfx); + + /* Insert into the command buffer. We usually append to the end of the + * command buffer, except for pre-graphics streams which go right before + * the graphics workload. (This implies a level of out-of-order processing + * that's allowed by Vulkan and required for efficient + * geometry/tessellation shaders.) 
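+       *
+       * list_addtail against the graphics stream's own node links the new
+       * CDM stream in as that node's predecessor, i.e. immediately before
+       * the VDM stream within control_streams.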
+ */ + if (before_gfx && cmd->current_cs.gfx) { + list_addtail(&cs->node, &cmd->current_cs.gfx->node); + } else { + list_addtail(&cs->node, &cmd->control_streams); + } + + *ptr = cs; + + if (!compute) + hk_cs_init_graphics(cmd, cs); + } + + assert(*ptr != NULL); + return *ptr; +} + +static inline struct hk_cs * +hk_cmd_buffer_get_cs(struct hk_cmd_buffer *cmd, bool compute) +{ + struct hk_cs **ptr = compute ? &cmd->current_cs.cs : &cmd->current_cs.gfx; + return hk_cmd_buffer_get_cs_general(cmd, ptr, compute); +} + +void hk_ensure_cs_has_space(struct hk_cmd_buffer *cmd, struct hk_cs *cs, + size_t space); + +static void +hk_cmd_buffer_dirty_all(struct hk_cmd_buffer *cmd) +{ + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + struct hk_graphics_state *gfx = &cmd->state.gfx; + + vk_dynamic_graphics_state_dirty_all(dyn); + gfx->dirty = ~0; + gfx->shaders_dirty = ~0; + gfx->linked_dirty = ~0; + gfx->descriptors.root_dirty = true; +} + +static inline void +hk_cs_destroy(struct hk_cs *cs) +{ + if (cs->type == HK_CS_VDM) { + util_dynarray_fini(&cs->scissor); + util_dynarray_fini(&cs->depth_bias); + } + + free(cs); +} + +static void +hk_cmd_buffer_end_compute_internal(struct hk_cs **ptr) +{ + if (*ptr) { + struct hk_cs *cs = *ptr; + void *map = cs->current; + agx_push(map, CDM_STREAM_TERMINATE, _) + ; + + cs->current = map; + } + + *ptr = NULL; +} + +static void +hk_cmd_buffer_end_compute(struct hk_cmd_buffer *cmd) +{ + hk_cmd_buffer_end_compute_internal(&cmd->current_cs.cs); +} + +static void +hk_cmd_buffer_end_graphics(struct hk_cmd_buffer *cmd) +{ + struct hk_cs *cs = cmd->current_cs.gfx; + + if (cs) { + void *map = cs->current; + agx_push(map, VDM_STREAM_TERMINATE, _) + ; + + /* Scissor and depth bias arrays are staged to dynamic arrays on the CPU. + * When we end the control stream, they're done growing and are ready for + * upload. + */ + cs->uploaded_scissor = + hk_pool_upload(cmd, cs->scissor.data, cs->scissor.size, 64); + + cs->uploaded_zbias = + hk_pool_upload(cmd, cs->depth_bias.data, cs->depth_bias.size, 64); + + /* TODO: maybe free scissor/depth_bias now? */ + + cmd->current_cs.gfx->current = map; + cmd->current_cs.gfx = NULL; + hk_cmd_buffer_end_compute_internal(&cmd->current_cs.pre_gfx); + hk_cmd_buffer_end_compute_internal(&cmd->current_cs.post_gfx); + } + + assert(cmd->current_cs.gfx == NULL); + + /* We just flushed out the heap use. If we want to use it again, we'll need + * to queue a free for it again. 
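+    * (hk_geometry_state() does that: it zeroes heap_bottom with a deferred
+    * hk_queue_write() and sets uses_heap again.)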
+ */ + cmd->uses_heap = false; +} + +static inline uint64_t +hk_pipeline_stat_addr(struct hk_cmd_buffer *cmd, + VkQueryPipelineStatisticFlagBits stat) +{ + struct hk_root_descriptor_table *root = &cmd->state.gfx.descriptors.root; + VkQueryPipelineStatisticFlags flags = root->draw.pipeline_stats_flags; + + if (flags & stat) { + assert(!cmd->in_meta && "queries paused for meta"); + assert(util_bitcount(stat) == 1 && "by construction"); + + /* Prefix sum to determine the compacted index in the query pool */ + uint32_t index = util_bitcount(flags & (stat - 1)); + + return root->draw.pipeline_stats + (sizeof(uint64_t) * index); + } else { + /* Query disabled */ + return 0; + } +} + +void hk_cmd_buffer_begin_graphics(struct hk_cmd_buffer *cmd, + const VkCommandBufferBeginInfo *pBeginInfo); +void hk_cmd_buffer_begin_compute(struct hk_cmd_buffer *cmd, + const VkCommandBufferBeginInfo *pBeginInfo); + +void hk_cmd_invalidate_graphics_state(struct hk_cmd_buffer *cmd); +void hk_cmd_invalidate_compute_state(struct hk_cmd_buffer *cmd); + +void hk_cmd_bind_shaders(struct vk_command_buffer *vk_cmd, uint32_t stage_count, + const gl_shader_stage *stages, + struct vk_shader **const shaders); + +void hk_cmd_bind_graphics_shader(struct hk_cmd_buffer *cmd, + const gl_shader_stage stage, + struct hk_api_shader *shader); + +void hk_cmd_bind_compute_shader(struct hk_cmd_buffer *cmd, + struct hk_api_shader *shader); + +void hk_cmd_bind_vertex_buffer(struct hk_cmd_buffer *cmd, uint32_t vb_idx, + struct hk_addr_range addr_range); + +static inline struct hk_descriptor_state * +hk_get_descriptors_state(struct hk_cmd_buffer *cmd, + VkPipelineBindPoint bind_point) +{ + switch (bind_point) { + case VK_PIPELINE_BIND_POINT_GRAPHICS: + return &cmd->state.gfx.descriptors; + case VK_PIPELINE_BIND_POINT_COMPUTE: + return &cmd->state.cs.descriptors; + default: + unreachable("Unhandled bind point"); + } +}; + +void hk_cmd_flush_wait_dep(struct hk_cmd_buffer *cmd, + const VkDependencyInfo *dep, bool wait); + +void hk_cmd_invalidate_deps(struct hk_cmd_buffer *cmd, uint32_t dep_count, + const VkDependencyInfo *deps); + +void hk_cmd_buffer_flush_push_descriptors(struct hk_cmd_buffer *cmd, + struct hk_descriptor_state *desc); + +void hk_meta_resolve_rendering(struct hk_cmd_buffer *cmd, + const VkRenderingInfo *pRenderingInfo); + +uint64_t hk_cmd_buffer_upload_root(struct hk_cmd_buffer *cmd, + VkPipelineBindPoint bind_point); + +void hk_reserve_scratch(struct hk_cmd_buffer *cmd, struct hk_cs *cs, + struct hk_shader *s); + +uint32_t hk_upload_usc_words(struct hk_cmd_buffer *cmd, struct hk_shader *s, + struct hk_linked_shader *linked); + +uint32_t hk_upload_usc_words_kernel(struct hk_cmd_buffer *cmd, + struct hk_shader *s, void *data, + size_t data_size); + +void hk_usc_upload_spilled_rt_descs(struct agx_usc_builder *b, + struct hk_cmd_buffer *cmd); + +void hk_cdm_cache_flush(struct hk_device *dev, struct hk_cs *cs); + +struct hk_grid { + bool indirect; + union { + uint32_t count[3]; + uint64_t ptr; + }; +}; + +static struct hk_grid +hk_grid(uint32_t x, uint32_t y, uint32_t z) +{ + return (struct hk_grid){.indirect = false, .count = {x, y, z}}; +} + +static struct hk_grid +hk_grid_indirect(uint64_t ptr) +{ + return (struct hk_grid){.indirect = true, .ptr = ptr}; +} + +void hk_dispatch_with_usc(struct hk_device *dev, struct hk_cs *cs, + struct hk_shader *s, uint32_t usc, + struct hk_grid grid, struct hk_grid local_size); + +static inline void +hk_dispatch_with_local_size(struct hk_cmd_buffer *cmd, struct hk_cs *cs, + struct hk_shader *s, 
struct hk_grid grid, + struct hk_grid local_size) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + uint32_t usc = hk_upload_usc_words(cmd, s, s->only_linked); + + hk_reserve_scratch(cmd, cs, s); + hk_dispatch_with_usc(dev, cs, s, usc, grid, local_size); +} + +static inline void +hk_dispatch(struct hk_cmd_buffer *cmd, struct hk_cs *cs, struct hk_shader *s, + struct hk_grid grid) +{ + assert(s->info.stage == MESA_SHADER_COMPUTE); + + struct hk_grid local_size = + hk_grid(s->info.cs.local_size[0], s->info.cs.local_size[1], + s->info.cs.local_size[2]); + + if (!grid.indirect) { + grid.count[0] *= local_size.count[0]; + grid.count[1] *= local_size.count[1]; + grid.count[2] *= local_size.count[2]; + } + + hk_dispatch_with_local_size(cmd, cs, s, grid, local_size); +} + +void hk_queue_write(struct hk_cmd_buffer *cmd, uint64_t address, uint32_t value, + bool after_gfx); diff --git a/src/asahi/vulkan/hk_cmd_clear.c b/src/asahi/vulkan/hk_cmd_clear.c new file mode 100644 index 00000000000..427c5fed2a1 --- /dev/null +++ b/src/asahi/vulkan/hk_cmd_clear.c @@ -0,0 +1,196 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "agx_formats.h" +#include "hk_cmd_buffer.h" + +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_image.h" +#include "hk_image_view.h" +#include "hk_physical_device.h" + +#include "vk_format.h" +#include "vk_meta.h" + +static VkImageViewType +render_view_type(VkImageType image_type, unsigned layer_count) +{ + switch (image_type) { + case VK_IMAGE_TYPE_1D: + return layer_count == 1 ? VK_IMAGE_VIEW_TYPE_1D + : VK_IMAGE_VIEW_TYPE_1D_ARRAY; + case VK_IMAGE_TYPE_2D: + return layer_count == 1 ? VK_IMAGE_VIEW_TYPE_2D + : VK_IMAGE_VIEW_TYPE_2D_ARRAY; + case VK_IMAGE_TYPE_3D: + return VK_IMAGE_VIEW_TYPE_3D; + default: + unreachable("Invalid image type"); + } +} + +static void +clear_image(struct hk_cmd_buffer *cmd, struct hk_image *image, + VkImageLayout image_layout, VkFormat format, + const VkClearValue *clear_value, uint32_t range_count, + const VkImageSubresourceRange *ranges) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + ASSERTED VkResult result; + + for (uint32_t r = 0; r < range_count; r++) { + const uint32_t level_count = + vk_image_subresource_level_count(&image->vk, &ranges[r]); + + for (uint32_t l = 0; l < level_count; l++) { + const uint32_t level = ranges[r].baseMipLevel + l; + + const VkExtent3D level_extent = + vk_image_mip_level_extent(&image->vk, level); + + uint32_t base_array_layer, layer_count; + if (image->vk.image_type == VK_IMAGE_TYPE_3D) { + base_array_layer = 0; + layer_count = level_extent.depth; + } else { + base_array_layer = ranges[r].baseArrayLayer; + layer_count = + vk_image_subresource_layer_count(&image->vk, &ranges[r]); + } + + const VkImageViewUsageCreateInfo view_usage_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_USAGE_CREATE_INFO, + .usage = (ranges[r].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) + ? 
VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT + : VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, + }; + const VkImageViewCreateInfo view_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .flags = VK_IMAGE_VIEW_CREATE_INTERNAL_MESA, + .pNext = &view_usage_info, + .image = hk_image_to_handle(image), + .viewType = render_view_type(image->vk.image_type, layer_count), + .format = format, + .subresourceRange = + { + .aspectMask = image->vk.aspects, + .baseMipLevel = level, + .levelCount = 1, + .baseArrayLayer = base_array_layer, + .layerCount = layer_count, + }, + }; + + /* We use vk_meta_create_image_view here for lifetime managemnt */ + VkImageView view; + result = + vk_meta_create_image_view(&cmd->vk, &dev->meta, &view_info, &view); + assert(result == VK_SUCCESS); + + VkRenderingInfo render = { + .sType = VK_STRUCTURE_TYPE_RENDERING_INFO, + .renderArea = + { + .offset = {0, 0}, + .extent = {level_extent.width, level_extent.height}, + }, + .layerCount = layer_count, + }; + + VkRenderingAttachmentInfo vk_att = { + .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO, + .imageView = view, + .imageLayout = image_layout, + .loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .clearValue = *clear_value, + }; + + if (ranges[r].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) { + render.colorAttachmentCount = 1; + render.pColorAttachments = &vk_att; + } + if (ranges[r].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) + render.pDepthAttachment = &vk_att; + if (ranges[r].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) + render.pStencilAttachment = &vk_att; + + hk_CmdBeginRendering(hk_cmd_buffer_to_handle(cmd), &render); + hk_CmdEndRendering(hk_cmd_buffer_to_handle(cmd)); + } + } +} + +static VkFormat +vk_packed_int_format_for_size(unsigned size_B) +{ + switch (size_B) { + case 1: + return VK_FORMAT_R8_UINT; + case 2: + return VK_FORMAT_R16_UINT; + case 4: + return VK_FORMAT_R32_UINT; + case 8: + return VK_FORMAT_R32G32_UINT; + case 16: + return VK_FORMAT_R32G32B32A32_UINT; + default: + unreachable("Invalid image format size"); + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdClearColorImage(VkCommandBuffer commandBuffer, VkImage _image, + VkImageLayout imageLayout, + const VkClearColorValue *pColor, uint32_t rangeCount, + const VkImageSubresourceRange *pRanges) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_image, image, _image); + + VkClearValue clear_value = { + .color = *pColor, + }; + + VkFormat vk_format = image->vk.format; + if (vk_format == VK_FORMAT_R64_UINT || vk_format == VK_FORMAT_R64_SINT) + vk_format = VK_FORMAT_R32G32_UINT; + + enum pipe_format p_format = vk_format_to_pipe_format(vk_format); + assert(p_format != PIPE_FORMAT_NONE); + + if (!agx_pixel_format[p_format].renderable) { + memset(&clear_value, 0, sizeof(clear_value)); + util_format_pack_rgba(p_format, clear_value.color.uint32, pColor->uint32, + 1); + + unsigned bpp = util_format_get_blocksize(p_format); + vk_format = vk_packed_int_format_for_size(bpp); + } + + clear_image(cmd, image, imageLayout, vk_format, &clear_value, rangeCount, + pRanges); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer, VkImage _image, + VkImageLayout imageLayout, + const VkClearDepthStencilValue *pDepthStencil, + uint32_t rangeCount, + const VkImageSubresourceRange *pRanges) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_image, image, _image); + + const VkClearValue clear_value = { + .depthStencil = *pDepthStencil, + }; + + clear_image(cmd, image, 
imageLayout, image->vk.format, &clear_value, + rangeCount, pRanges); +} diff --git a/src/asahi/vulkan/hk_cmd_dispatch.c b/src/asahi/vulkan/hk_cmd_dispatch.c new file mode 100644 index 00000000000..54c1a454992 --- /dev/null +++ b/src/asahi/vulkan/hk_cmd_dispatch.c @@ -0,0 +1,249 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "shaders/query.h" +#include "vulkan/vulkan_core.h" +#include "agx_helpers.h" +#include "agx_linker.h" +#include "agx_nir_lower_gs.h" +#include "agx_pack.h" +#include "agx_scratch.h" +#include "agx_tilebuffer.h" +#include "hk_buffer.h" +#include "hk_cmd_buffer.h" +#include "hk_descriptor_set.h" +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_physical_device.h" +#include "hk_shader.h" +#include "pool.h" + +void +hk_cmd_buffer_begin_compute(struct hk_cmd_buffer *cmd, + const VkCommandBufferBeginInfo *pBeginInfo) +{ +} + +void +hk_cmd_invalidate_compute_state(struct hk_cmd_buffer *cmd) +{ + memset(&cmd->state.cs, 0, sizeof(cmd->state.cs)); +} + +void +hk_cmd_bind_compute_shader(struct hk_cmd_buffer *cmd, + struct hk_api_shader *shader) +{ + cmd->state.cs.shader = shader; +} + +void +hk_cdm_cache_flush(struct hk_device *dev, struct hk_cs *cs) +{ + assert(cs->type == HK_CS_CDM); + assert(cs->current + AGX_CDM_BARRIER_LENGTH < cs->end && + "caller must ensure space"); + + uint8_t *out = cs->current; + + agx_push(out, CDM_BARRIER, cfg) { + cfg.unk_5 = true; + cfg.unk_6 = true; + cfg.unk_8 = true; + // cfg.unk_11 = true; + // cfg.unk_20 = true; + if (dev->dev.params.num_clusters_total > 1) { + // cfg.unk_24 = true; + if (dev->dev.params.gpu_generation == 13) { + cfg.unk_4 = true; + // cfg.unk_26 = true; + } + } + + /* With multiple launches in the same CDM stream, we can get cache + * coherency (? or sync?) issues. We hit this with blits, which need - in + * between dispatches - need the PBE cache to be flushed and the texture + * cache to be invalidated. Until we know what bits mean what exactly, + * let's just set these after every launch to be safe. We can revisit in + * the future when we figure out what the bits mean. + */ + cfg.unk_0 = true; + cfg.unk_1 = true; + cfg.unk_2 = true; + cfg.usc_cache_inval = true; + cfg.unk_4 = true; + cfg.unk_5 = true; + cfg.unk_6 = true; + cfg.unk_7 = true; + cfg.unk_8 = true; + cfg.unk_9 = true; + cfg.unk_10 = true; + cfg.unk_11 = true; + cfg.unk_12 = true; + cfg.unk_13 = true; + cfg.unk_14 = true; + cfg.unk_15 = true; + cfg.unk_16 = true; + cfg.unk_17 = true; + cfg.unk_18 = true; + cfg.unk_19 = true; + } + + cs->current = out; +} + +/* + * Enqueue workgroups to a given CDM control stream with a given prepared USC + * words. This does not interact with any global state, so it is suitable for + * internal dispatches that do not save/restore state. That can be simpler / + * lower overhead than vk_meta for special operations that logically operate + * as graphics. 
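+ *
+ * A typical internal caller looks roughly like (see dispatch() in this file):
+ *
+ *    uint64_t params = hk_pool_upload(cmd, &p, sizeof(p), 8);
+ *    uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &params, sizeof(params));
+ *    hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(1, 1, 1), hk_grid(1, 1, 1));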
+ */ +void +hk_dispatch_with_usc(struct hk_device *dev, struct hk_cs *cs, + struct hk_shader *s, uint32_t usc, struct hk_grid grid, + struct hk_grid local_size) +{ + assert(cs->current + 0x2000 < cs->end && "should have ensured space"); + uint8_t *out = cs->current; + + agx_push(out, CDM_LAUNCH_WORD_0, cfg) { + if (grid.indirect) + cfg.mode = AGX_CDM_MODE_INDIRECT_GLOBAL; + else + cfg.mode = AGX_CDM_MODE_DIRECT; + + /* For now, always bind the txf sampler and nothing else */ + cfg.sampler_state_register_count = 1; + + cfg.uniform_register_count = s->b.info.push_count; + cfg.preshader_register_count = s->b.info.nr_preamble_gprs; + } + + agx_push(out, CDM_LAUNCH_WORD_1, cfg) { + cfg.pipeline = usc; + } + + /* Added in G14X */ + if (dev->dev.params.gpu_generation >= 14 && + dev->dev.params.num_clusters_total > 1) { + + agx_push(out, CDM_UNK_G14X, cfg) + ; + } + + assert(!local_size.indirect); + + if (grid.indirect) { + agx_push(out, CDM_INDIRECT, cfg) { + cfg.address_hi = grid.ptr >> 32; + cfg.address_lo = grid.ptr & BITFIELD64_MASK(32); + } + } else { + agx_push(out, CDM_GLOBAL_SIZE, cfg) { + cfg.x = grid.count[0]; + cfg.y = grid.count[1]; + cfg.z = grid.count[2]; + } + } + + agx_push(out, CDM_LOCAL_SIZE, cfg) { + cfg.x = local_size.count[0]; + cfg.y = local_size.count[1]; + cfg.z = local_size.count[2]; + } + + cs->current = out; + hk_cdm_cache_flush(dev, cs); +} + +static void +dispatch(struct hk_cmd_buffer *cmd, struct hk_grid grid) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct hk_shader *s = hk_only_variant(cmd->state.cs.shader); + struct hk_cs *cs = hk_cmd_buffer_get_cs(cmd, true /* compute */); + if (!cs) + return; + + uint64_t stat = hk_pipeline_stat_addr( + cmd, VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT); + + if (stat) { + uint32_t local_size_threads = s->info.cs.local_size[0] * + s->info.cs.local_size[1] * + s->info.cs.local_size[2]; + + struct libagx_cs_invocation_params p = { + .grid = cmd->state.cs.descriptors.root.cs.group_count_addr, + .local_size_threads = local_size_threads, + .statistic = stat, + }; + + struct hk_shader *s = + hk_meta_kernel(dev, agx_nir_increment_cs_invocations, NULL, 0); + + uint64_t params = hk_pool_upload(cmd, &p, sizeof(p), 8); + uint32_t usc = + hk_upload_usc_words_kernel(cmd, s, ¶ms, sizeof(params)); + + hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(1, 1, 1), hk_grid(1, 1, 1)); + } + + hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */); + hk_dispatch(cmd, cs, s, grid); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDispatchBase(VkCommandBuffer commandBuffer, uint32_t baseGroupX, + uint32_t baseGroupY, uint32_t baseGroupZ, + uint32_t groupCountX, uint32_t groupCountY, + uint32_t groupCountZ) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_descriptor_state *desc = &cmd->state.cs.descriptors; + if (desc->push_dirty) + hk_cmd_buffer_flush_push_descriptors(cmd, desc); + + desc->root.cs.base_group[0] = baseGroupX; + desc->root.cs.base_group[1] = baseGroupY; + desc->root.cs.base_group[2] = baseGroupZ; + + /* We don't want to key the shader to whether we're indirectly dispatching, + * so treat everything as indirect. 
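+    * For a direct dispatch that means uploading the group counts as a
+    * VkDispatchIndirectCommand and pointing group_count_addr at it, so the
+    * shader always reads its grid from memory.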
+ */ + VkDispatchIndirectCommand group_count = { + .x = groupCountX, + .y = groupCountY, + .z = groupCountZ, + }; + + desc->root.cs.group_count_addr = + hk_pool_upload(cmd, &group_count, sizeof(group_count), 8); + + dispatch(cmd, hk_grid(groupCountX, groupCountY, groupCountZ)); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, + VkDeviceSize offset) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_buffer, buffer, _buffer); + struct hk_descriptor_state *desc = &cmd->state.cs.descriptors; + if (desc->push_dirty) + hk_cmd_buffer_flush_push_descriptors(cmd, desc); + + desc->root.cs.base_group[0] = 0; + desc->root.cs.base_group[1] = 0; + desc->root.cs.base_group[2] = 0; + + uint64_t dispatch_addr = hk_buffer_address(buffer, offset); + assert(dispatch_addr != 0); + + desc->root.cs.group_count_addr = dispatch_addr; + dispatch(cmd, hk_grid_indirect(dispatch_addr)); +} diff --git a/src/asahi/vulkan/hk_cmd_draw.c b/src/asahi/vulkan/hk_cmd_draw.c new file mode 100644 index 00000000000..78a7a922d15 --- /dev/null +++ b/src/asahi/vulkan/hk_cmd_draw.c @@ -0,0 +1,3737 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include +#include "agx_bg_eot.h" +#include "agx_bo.h" +#include "agx_compile.h" +#include "agx_compiler.h" +#include "agx_device.h" +#include "agx_helpers.h" +#include "agx_linker.h" +#include "agx_nir_lower_gs.h" +#include "agx_nir_lower_vbo.h" +#include "agx_ppp.h" +#include "agx_tilebuffer.h" +#include "agx_usc.h" +#include "agx_uvs.h" +#include "hk_buffer.h" +#include "hk_cmd_buffer.h" +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_image.h" +#include "hk_image_view.h" +#include "hk_physical_device.h" +#include "hk_private.h" +#include "hk_shader.h" + +#include "asahi/genxml/agx_pack.h" +#include "asahi/lib/libagx_shaders.h" +#include "asahi/lib/shaders/geometry.h" +#include "shaders/query.h" +#include "shaders/tessellator.h" +#include "util/bitpack_helpers.h" +#include "util/blend.h" +#include "util/format/format_utils.h" +#include "util/format/u_formats.h" +#include "util/macros.h" +#include "util/ralloc.h" +#include "vulkan/vulkan_core.h" +#include "layout.h" +#include "nir.h" +#include "nir_builder.h" +#include "nir_lower_blend.h" +#include "nir_xfb_info.h" +#include "pool.h" +#include "shader_enums.h" +#include "vk_blend.h" +#include "vk_enum_to_str.h" +#include "vk_format.h" +#include "vk_graphics_state.h" +#include "vk_pipeline.h" +#include "vk_render_pass.h" +#include "vk_standard_sample_locations.h" +#include "vk_util.h" + +#define IS_DIRTY(bit) BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_##bit) + +#define IS_SHADER_DIRTY(bit) \ + (cmd->state.gfx.shaders_dirty & BITFIELD_BIT(MESA_SHADER_##bit)) + +#define IS_LINKED_DIRTY(bit) \ + (cmd->state.gfx.linked_dirty & BITFIELD_BIT(MESA_SHADER_##bit)) + +struct hk_draw { + struct hk_grid b; + struct hk_addr_range index; + bool indexed; + uint32_t start; + uint32_t index_bias; + uint32_t start_instance; + + /* Indicates that the indirect draw consists of raw VDM commands and should + * be stream linked to. Used to accelerate tessellation. 
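+    * (The tessellator fast path writes one VDM Index List command per patch;
+    * rather than translating those into an API draw, we just link the VDM
+    * stream to them.)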
+ */ + bool raw; + + /* Set within hk_draw() but here so geometry/tessellation can override */ + bool restart; + enum agx_index_size index_size; +}; + +static struct hk_draw +hk_draw_indirect(uint64_t ptr) +{ + return (struct hk_draw){.b = hk_grid_indirect(ptr)}; +} + +static struct hk_draw +hk_draw_indexed_indirect(uint64_t ptr, struct hk_addr_range index, + enum agx_index_size index_size, bool restart) +{ + return (struct hk_draw){ + .b = hk_grid_indirect(ptr), + .index = index, + .indexed = true, + .index_size = index_size, + .restart = restart, + }; +} + +/* XXX: deduplicate */ +static inline enum mesa_prim +vk_conv_topology(VkPrimitiveTopology topology) +{ + switch (topology) { + case VK_PRIMITIVE_TOPOLOGY_POINT_LIST: + return MESA_PRIM_POINTS; + case VK_PRIMITIVE_TOPOLOGY_LINE_LIST: + return MESA_PRIM_LINES; + case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP: + return MESA_PRIM_LINE_STRIP; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST: +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wswitch" + case VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA: +#pragma GCC diagnostic pop + return MESA_PRIM_TRIANGLES; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP: + return MESA_PRIM_TRIANGLE_STRIP; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN: + return MESA_PRIM_TRIANGLE_FAN; + case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY: + return MESA_PRIM_LINES_ADJACENCY; + case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY: + return MESA_PRIM_LINE_STRIP_ADJACENCY; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY: + return MESA_PRIM_TRIANGLES_ADJACENCY; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY: + return MESA_PRIM_TRIANGLE_STRIP_ADJACENCY; + case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST: + return MESA_PRIM_PATCHES; + default: + unreachable("invalid"); + } +} + +static void +hk_cmd_buffer_dirty_render_pass(struct hk_cmd_buffer *cmd) +{ + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + + /* These depend on color attachment count */ + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES); + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES); + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS); + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS); + + /* These depend on the depth/stencil format */ + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE); + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE); + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE); + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE); + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS); + + /* This may depend on render targets for ESO */ + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES); +} + +void +hk_cmd_buffer_begin_graphics(struct hk_cmd_buffer *cmd, + const VkCommandBufferBeginInfo *pBeginInfo) +{ + if (cmd->vk.level != VK_COMMAND_BUFFER_LEVEL_PRIMARY && + (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) { + char gcbiar_data[VK_GCBIARR_DATA_SIZE(HK_MAX_RTS)]; + const VkRenderingInfo *resume_info = + vk_get_command_buffer_inheritance_as_rendering_resume( + cmd->vk.level, pBeginInfo, gcbiar_data); + if (resume_info) { + hk_CmdBeginRendering(hk_cmd_buffer_to_handle(cmd), resume_info); + } else { + const VkCommandBufferInheritanceRenderingInfo *inheritance_info = + vk_get_command_buffer_inheritance_rendering_info(cmd->vk.level, + pBeginInfo); + assert(inheritance_info); + + struct hk_rendering_state *render = &cmd->state.gfx.render; + render->flags = 
inheritance_info->flags; + render->area = (VkRect2D){}; + render->layer_count = 0; + render->view_mask = inheritance_info->viewMask; + render->tilebuffer.nr_samples = inheritance_info->rasterizationSamples; + + render->color_att_count = inheritance_info->colorAttachmentCount; + for (uint32_t i = 0; i < render->color_att_count; i++) { + render->color_att[i].vk_format = + inheritance_info->pColorAttachmentFormats[i]; + } + render->depth_att.vk_format = inheritance_info->depthAttachmentFormat; + render->stencil_att.vk_format = + inheritance_info->stencilAttachmentFormat; + + hk_cmd_buffer_dirty_render_pass(cmd); + } + } + + hk_cmd_buffer_dirty_all(cmd); + + /* If multiview is disabled, always read 0. If multiview is enabled, + * hk_set_view_index will dirty the root each draw. + */ + cmd->state.gfx.descriptors.root.draw.view_index = 0; + cmd->state.gfx.descriptors.root_dirty = true; +} + +void +hk_cmd_invalidate_graphics_state(struct hk_cmd_buffer *cmd) +{ + hk_cmd_buffer_dirty_all(cmd); + + /* From the Vulkan 1.3.275 spec: + * + * "...There is one exception to this rule - if the primary command + * buffer is inside a render pass instance, then the render pass and + * subpass state is not disturbed by executing secondary command + * buffers." + * + * We need to reset everything EXCEPT the render pass state. + */ + struct hk_rendering_state render_save = cmd->state.gfx.render; + memset(&cmd->state.gfx, 0, sizeof(cmd->state.gfx)); + cmd->state.gfx.render = render_save; +} + +static void +hk_attachment_init(struct hk_attachment *att, + const VkRenderingAttachmentInfo *info) +{ + if (info == NULL || info->imageView == VK_NULL_HANDLE) { + *att = (struct hk_attachment){ + .iview = NULL, + }; + return; + } + + VK_FROM_HANDLE(hk_image_view, iview, info->imageView); + *att = (struct hk_attachment){ + .vk_format = iview->vk.format, + .iview = iview, + }; + + if (info->resolveMode != VK_RESOLVE_MODE_NONE) { + VK_FROM_HANDLE(hk_image_view, res_iview, info->resolveImageView); + att->resolve_mode = info->resolveMode; + att->resolve_iview = res_iview; + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetRenderingAreaGranularityKHR( + VkDevice device, const VkRenderingAreaInfoKHR *pRenderingAreaInfo, + VkExtent2D *pGranularity) +{ + *pGranularity = (VkExtent2D){.width = 1, .height = 1}; +} + +static struct hk_bg_eot +hk_build_bg_eot(struct hk_cmd_buffer *cmd, const VkRenderingInfo *info, + bool store, bool partial_render, bool incomplete_render_area) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct hk_rendering_state *render = &cmd->state.gfx.render; + + /* Construct the key */ + struct agx_bg_eot_key key = {.tib = render->tilebuffer}; + static_assert(AGX_BG_EOT_NONE == 0, "default initializer"); + + key.tib.layered = (render->cr.layers > 1); + + bool needs_textures_for_spilled_rts = + agx_tilebuffer_spills(&render->tilebuffer) && !partial_render && !store; + + for (unsigned i = 0; i < info->colorAttachmentCount; ++i) { + const VkRenderingAttachmentInfo *att_info = &info->pColorAttachments[i]; + if (att_info->imageView == VK_NULL_HANDLE) + continue; + + /* Partial render programs exist only to store/load the tilebuffer to + * main memory. When render targets are already spilled to main memory, + * there's nothing to do. + */ + if (key.tib.spilled[i] && (partial_render || store)) + continue; + + if (store) { + bool store = att_info->storeOp == VK_ATTACHMENT_STORE_OP_STORE; + + /* When resolving, we store the intermediate multisampled image as the + * resolve is a separate control stream. 
This could be optimized. + */ + store |= att_info->resolveMode != VK_RESOLVE_MODE_NONE; + + /* Partial renders always need to flush to memory. */ + store |= partial_render; + + key.op[i] = store ? AGX_EOT_STORE : AGX_BG_EOT_NONE; + } else { + bool load = att_info->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD; + bool clear = att_info->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR; + + /* The background program used for partial renders must always load + * whatever was stored in the mid-frame end-of-tile program. + */ + load |= partial_render; + + /* With an incomplete render area, we're forced to load back tiles and + * then use the 3D pipe for the clear. + */ + load |= incomplete_render_area; + + /* Don't read back spilled render targets, they're already in memory */ + load &= !key.tib.spilled[i]; + + key.op[i] = load ? AGX_BG_LOAD + : clear ? AGX_BG_CLEAR + : AGX_BG_EOT_NONE; + } + } + + /* Begin building the pipeline */ + size_t usc_size = agx_usc_size(3 + HK_MAX_RTS); + struct agx_ptr t = hk_pool_usc_alloc(cmd, usc_size, 64); + if (!t.cpu) + return (struct hk_bg_eot){.usc = t.gpu}; + + struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size); + + bool uses_txf = false; + unsigned uniforms = 0; + unsigned nr_tex = 0; + + for (unsigned rt = 0; rt < HK_MAX_RTS; ++rt) { + const VkRenderingAttachmentInfo *att_info = &info->pColorAttachments[rt]; + struct hk_image_view *iview = render->color_att[rt].iview; + + if (key.op[rt] == AGX_BG_LOAD) { + uses_txf = true; + + uint32_t index = key.tib.layered + ? iview->planes[0].layered_background_desc_index + : iview->planes[0].background_desc_index; + + agx_usc_pack(&b, TEXTURE, cfg) { + /* Shifted to match eMRT indexing, could be optimized */ + cfg.start = rt * 2; + cfg.count = 1; + cfg.buffer = dev->images.bo->ptr.gpu + index * AGX_TEXTURE_LENGTH; + } + + nr_tex = (rt * 2) + 1; + } else if (key.op[rt] == AGX_BG_CLEAR) { + static_assert(sizeof(att_info->clearValue.color) == 16, "fixed ABI"); + uint64_t colour = + hk_pool_upload(cmd, &att_info->clearValue.color, 16, 16); + + agx_usc_uniform(&b, 4 + (8 * rt), 8, colour); + uniforms = MAX2(uniforms, 4 + (8 * rt) + 8); + } else if (key.op[rt] == AGX_EOT_STORE) { + uint32_t index = key.tib.layered + ? iview->planes[0].layered_eot_pbe_desc_index + : iview->planes[0].eot_pbe_desc_index; + + agx_usc_pack(&b, TEXTURE, cfg) { + cfg.start = rt; + cfg.count = 1; + cfg.buffer = dev->images.bo->ptr.gpu + index * AGX_TEXTURE_LENGTH; + } + + nr_tex = rt + 1; + } + } + + if (needs_textures_for_spilled_rts) { + hk_usc_upload_spilled_rt_descs(&b, cmd); + uniforms = MAX2(uniforms, 4); + } + + if (uses_txf) { + agx_usc_push_packed(&b, SAMPLER, dev->rodata.txf_sampler); + } + + /* For attachmentless rendering, we don't know the sample count until + * draw-time. But we have trivial bg/eot programs in that case too. + */ + if (key.tib.nr_samples >= 1) { + agx_usc_push_packed(&b, SHARED, &key.tib.usc); + } else { + assert(key.tib.sample_size_B == 0); + agx_usc_shared_none(&b); + + key.tib.nr_samples = 1; + } + + /* Get the shader */ + key.reserved_preamble = uniforms; + /* XXX: locking? 
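+    * (dev->bg_eot is presumably a device-level shader cache, and command
+    * buffers may be recorded concurrently, hence the question.)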
*/ + struct agx_bg_eot_shader *shader = agx_get_bg_eot_shader(&dev->bg_eot, &key); + + agx_usc_pack(&b, SHADER, cfg) { + cfg.code = shader->ptr; + cfg.unk_2 = 0; + } + + agx_usc_pack(&b, REGISTERS, cfg) + cfg.register_count = shader->info.nr_gprs; + + if (shader->info.has_preamble) { + agx_usc_pack(&b, PRESHADER, cfg) { + cfg.code = shader->ptr + shader->info.preamble_offset; + } + } else { + agx_usc_pack(&b, NO_PRESHADER, cfg) + ; + } + + struct hk_bg_eot ret = {.usc = t.gpu}; + + agx_pack(&ret.counts, COUNTS, cfg) { + cfg.uniform_register_count = shader->info.push_count; + cfg.preshader_register_count = shader->info.nr_preamble_gprs; + cfg.texture_state_register_count = nr_tex; + cfg.sampler_state_register_count = + agx_translate_sampler_state_count(uses_txf ? 1 : 0, false); + } + + return ret; +} + +static bool +is_aligned(unsigned x, unsigned pot_alignment) +{ + assert(util_is_power_of_two_nonzero(pot_alignment)); + return (x & (pot_alignment - 1)) == 0; +} + +static void +hk_merge_render_iview(struct hk_rendering_state *render, + struct hk_image_view *iview) +{ + if (iview) { + unsigned samples = iview->vk.image->samples; + /* TODO: is this right for ycbcr? */ + unsigned level = iview->vk.base_mip_level; + unsigned width = u_minify(iview->vk.image->extent.width, level); + unsigned height = u_minify(iview->vk.image->extent.height, level); + + assert(render->tilebuffer.nr_samples == 0 || + render->tilebuffer.nr_samples == samples); + render->tilebuffer.nr_samples = samples; + + /* TODO: Is this merging logic sound? Not sure how this is supposed to + * work conceptually. + */ + render->cr.width = MAX2(render->cr.width, width); + render->cr.height = MAX2(render->cr.height, height); + } +} + +static void +hk_pack_zls_control(struct agx_zls_control_packed *packed, + struct ail_layout *z_layout, struct ail_layout *s_layout, + const VkRenderingAttachmentInfo *attach_z, + const VkRenderingAttachmentInfo *attach_s, + bool incomplete_render_area, bool partial_render) +{ + agx_pack(packed, ZLS_CONTROL, zls_control) { + if (z_layout) { + zls_control.z_store_enable = + attach_z->storeOp == VK_ATTACHMENT_STORE_OP_STORE || + attach_z->resolveMode != VK_RESOLVE_MODE_NONE || partial_render; + + zls_control.z_load_enable = + attach_z->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD || partial_render || + incomplete_render_area; + + if (ail_is_compressed(z_layout)) { + zls_control.z_compress_1 = true; + zls_control.z_compress_2 = true; + } + + if (z_layout->format == PIPE_FORMAT_Z16_UNORM) { + zls_control.z_format = AGX_ZLS_FORMAT_16; + } else { + zls_control.z_format = AGX_ZLS_FORMAT_32F; + } + } + + if (s_layout) { + /* TODO: + * Fail + * dEQP-VK.renderpass.dedicated_allocation.formats.d32_sfloat_s8_uint.input.dont_care.store.self_dep_clear_draw_use_input_aspect + * without the force + * .. maybe a VkRenderPass emulation bug. 
+ */ + zls_control.s_store_enable = + attach_s->storeOp == VK_ATTACHMENT_STORE_OP_STORE || + attach_s->resolveMode != VK_RESOLVE_MODE_NONE || partial_render || + true; + + zls_control.s_load_enable = + attach_s->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD || partial_render || + incomplete_render_area; + + if (ail_is_compressed(s_layout)) { + zls_control.s_compress_1 = true; + zls_control.s_compress_2 = true; + } + } + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdBeginRendering(VkCommandBuffer commandBuffer, + const VkRenderingInfo *pRenderingInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_rendering_state *render = &cmd->state.gfx.render; + + memset(render, 0, sizeof(*render)); + + render->flags = pRenderingInfo->flags; + render->area = pRenderingInfo->renderArea; + render->view_mask = pRenderingInfo->viewMask; + render->layer_count = pRenderingInfo->layerCount; + render->tilebuffer.nr_samples = 0; + + const uint32_t layer_count = render->view_mask + ? util_last_bit(render->view_mask) + : render->layer_count; + + render->color_att_count = pRenderingInfo->colorAttachmentCount; + for (uint32_t i = 0; i < render->color_att_count; i++) { + hk_attachment_init(&render->color_att[i], + &pRenderingInfo->pColorAttachments[i]); + } + + hk_attachment_init(&render->depth_att, pRenderingInfo->pDepthAttachment); + hk_attachment_init(&render->stencil_att, pRenderingInfo->pStencilAttachment); + + for (uint32_t i = 0; i < render->color_att_count; i++) { + hk_merge_render_iview(render, render->color_att[i].iview); + } + + hk_merge_render_iview(render, + render->depth_att.iview ?: render->stencil_att.iview); + + /* Infer for attachmentless. samples is inferred at draw-time. */ + render->cr.width = + MAX2(render->cr.width, render->area.offset.x + render->area.extent.width); + + render->cr.height = MAX2(render->cr.height, + render->area.offset.y + render->area.extent.height); + + render->cr.layers = layer_count; + + /* Choose a tilebuffer layout given the framebuffer key */ + enum pipe_format formats[HK_MAX_RTS] = {0}; + for (unsigned i = 0; i < render->color_att_count; ++i) { + formats[i] = vk_format_to_pipe_format(render->color_att[i].vk_format); + } + + /* For now, we force layered=true since it makes compatibility problems way + * easier. + */ + render->tilebuffer = agx_build_tilebuffer_layout( + formats, render->color_att_count, render->tilebuffer.nr_samples, true); + + hk_cmd_buffer_dirty_render_pass(cmd); + + /* Determine whether the render area is complete, enabling us to use a + * fast-clear. + * + * TODO: If it is incomplete but tile aligned, it should be possibly to fast + * clear with the appropriate settings. This is critical for performance. 
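+    *
+    * "Complete" here means the area starts at (0, 0), covers the full
+    * cr.width x cr.height, and (with multiview) the view mask covers every
+    * layer.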
+ */ + bool incomplete_render_area = + render->area.offset.x > 0 || render->area.offset.y > 0 || + render->area.extent.width < render->cr.width || + render->area.extent.height < render->cr.height || + (render->view_mask && + render->view_mask != BITFIELD64_MASK(render->cr.layers)); + + render->cr.bg.main = hk_build_bg_eot(cmd, pRenderingInfo, false, false, + incomplete_render_area); + render->cr.bg.partial = + hk_build_bg_eot(cmd, pRenderingInfo, false, true, incomplete_render_area); + + render->cr.eot.main = + hk_build_bg_eot(cmd, pRenderingInfo, true, false, incomplete_render_area); + render->cr.eot.partial = render->cr.eot.main; + + render->cr.isp_bgobjvals = 0x300; + + const VkRenderingAttachmentInfo *attach_z = pRenderingInfo->pDepthAttachment; + const VkRenderingAttachmentInfo *attach_s = + pRenderingInfo->pStencilAttachment; + + render->cr.iogpu_unk_214 = 0xc000; + + struct ail_layout *z_layout = NULL, *s_layout = NULL; + + if (attach_z != NULL && attach_z != VK_NULL_HANDLE && attach_z->imageView) { + struct hk_image_view *view = render->depth_att.iview; + struct hk_image *image = + container_of(view->vk.image, struct hk_image, vk); + + z_layout = &image->planes[0].layout; + + unsigned level = view->vk.base_mip_level; + unsigned first_layer = view->vk.base_array_layer; + + const struct util_format_description *desc = + util_format_description(vk_format_to_pipe_format(view->vk.format)); + + assert(desc->format == PIPE_FORMAT_Z32_FLOAT || + desc->format == PIPE_FORMAT_Z16_UNORM || + desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT); + + render->cr.depth.buffer = + hk_image_base_address(image, 0) + + ail_get_layer_level_B(z_layout, first_layer, level); + + /* Main stride in pages */ + assert((z_layout->depth_px == 1 || + is_aligned(z_layout->layer_stride_B, AIL_PAGESIZE)) && + "Page aligned Z layers"); + + unsigned stride_pages = z_layout->layer_stride_B / AIL_PAGESIZE; + render->cr.depth.stride = ((stride_pages - 1) << 14) | 1; + + assert(z_layout->tiling != AIL_TILING_LINEAR && "must tile"); + + if (ail_is_compressed(z_layout)) { + render->cr.depth.meta = + hk_image_base_address(image, 0) + z_layout->metadata_offset_B + + (first_layer * z_layout->compression_layer_stride_B) + + z_layout->level_offsets_compressed_B[level]; + + /* Meta stride in cache lines */ + assert( + is_aligned(z_layout->compression_layer_stride_B, AIL_CACHELINE) && + "Cacheline aligned Z meta layers"); + + unsigned stride_lines = + z_layout->compression_layer_stride_B / AIL_CACHELINE; + render->cr.depth.meta_stride = (stride_lines - 1) << 14; + } + + float clear_depth = attach_z->clearValue.depthStencil.depth; + + if (z_layout->format == PIPE_FORMAT_Z16_UNORM) { + render->cr.isp_bgobjdepth = _mesa_float_to_unorm(clear_depth, 16); + render->cr.iogpu_unk_214 |= 0x40000; + } else { + render->cr.isp_bgobjdepth = fui(clear_depth); + } + } + + if (attach_s != NULL && attach_s != VK_NULL_HANDLE && attach_s->imageView) { + struct hk_image_view *view = render->stencil_att.iview; + struct hk_image *image = + container_of(view->vk.image, struct hk_image, vk); + + /* Stencil is always the last plane (possibly the only plane) */ + unsigned plane = image->plane_count - 1; + s_layout = &image->planes[plane].layout; + assert(s_layout->format == PIPE_FORMAT_S8_UINT); + + unsigned level = view->vk.base_mip_level; + unsigned first_layer = view->vk.base_array_layer; + + render->cr.stencil.buffer = + hk_image_base_address(image, plane) + + ail_get_layer_level_B(s_layout, first_layer, level); + + /* Main stride in pages */ + 
assert((s_layout->depth_px == 1 || + is_aligned(s_layout->layer_stride_B, AIL_PAGESIZE)) && + "Page aligned S layers"); + unsigned stride_pages = s_layout->layer_stride_B / AIL_PAGESIZE; + render->cr.stencil.stride = ((stride_pages - 1) << 14) | 1; + + if (ail_is_compressed(s_layout)) { + render->cr.stencil.meta = + hk_image_base_address(image, plane) + s_layout->metadata_offset_B + + (first_layer * s_layout->compression_layer_stride_B) + + s_layout->level_offsets_compressed_B[level]; + + /* Meta stride in cache lines */ + assert( + is_aligned(s_layout->compression_layer_stride_B, AIL_CACHELINE) && + "Cacheline aligned S meta layers"); + + unsigned stride_lines = + s_layout->compression_layer_stride_B / AIL_CACHELINE; + + render->cr.stencil.meta_stride = (stride_lines - 1) << 14; + } + + render->cr.isp_bgobjvals |= attach_s->clearValue.depthStencil.stencil; + } + + hk_pack_zls_control(&render->cr.zls_control, z_layout, s_layout, attach_z, + attach_s, incomplete_render_area, false); + + hk_pack_zls_control(&render->cr.zls_control_partial, z_layout, s_layout, + attach_z, attach_s, incomplete_render_area, true); + + /* If multiview is disabled, always read 0. If multiview is enabled, + * hk_set_view_index will dirty the root each draw. + */ + cmd->state.gfx.descriptors.root.draw.view_index = 0; + cmd->state.gfx.descriptors.root_dirty = true; + + if (render->flags & VK_RENDERING_RESUMING_BIT) + return; + + /* The first control stream of the render pass is special since it gets + * the clears. Create it and swap in the clear. + */ + assert(!cmd->current_cs.gfx && "not already in a render pass"); + struct hk_cs *cs = hk_cmd_buffer_get_cs(cmd, false /* compute */); + if (!cs) + return; + + cs->cr.bg.main = render->cr.bg.main; + cs->cr.zls_control = render->cr.zls_control; + + /* Reordering barrier for post-gfx, in case we had any. */ + hk_cmd_buffer_end_compute_internal(&cmd->current_cs.post_gfx); + + /* Don't reorder compute across render passes. + * + * TODO: Check if this is necessary if the proper PipelineBarriers are + * handled... there may be CTS bugs... + */ + hk_cmd_buffer_end_compute(cmd); + + if (incomplete_render_area) { + uint32_t clear_count = 0; + VkClearAttachment clear_att[HK_MAX_RTS + 1]; + for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) { + const VkRenderingAttachmentInfo *att_info = + &pRenderingInfo->pColorAttachments[i]; + if (att_info->imageView == VK_NULL_HANDLE || + att_info->loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR) + continue; + + clear_att[clear_count++] = (VkClearAttachment){ + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .colorAttachment = i, + .clearValue = att_info->clearValue, + }; + } + + clear_att[clear_count] = (VkClearAttachment){ + .aspectMask = 0, + }; + + if (attach_z && attach_z->imageView != VK_NULL_HANDLE && + attach_z->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) { + clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_DEPTH_BIT; + clear_att[clear_count].clearValue.depthStencil.depth = + attach_z->clearValue.depthStencil.depth; + } + + if (attach_s != NULL && attach_s->imageView != VK_NULL_HANDLE && + attach_s->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) { + clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT; + clear_att[clear_count].clearValue.depthStencil.stencil = + attach_s->clearValue.depthStencil.stencil; + } + + if (clear_att[clear_count].aspectMask != 0) + clear_count++; + + if (clear_count > 0) { + const VkClearRect clear_rect = { + .rect = render->area, + .baseArrayLayer = 0, + .layerCount = render->view_mask ? 
1 : render->layer_count, + }; + + hk_CmdClearAttachments(hk_cmd_buffer_to_handle(cmd), clear_count, + clear_att, 1, &clear_rect); + } + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdEndRendering(VkCommandBuffer commandBuffer) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_rendering_state *render = &cmd->state.gfx.render; + + hk_cmd_buffer_end_graphics(cmd); + + bool need_resolve = false; + + /* Translate render state back to VK for meta */ + VkRenderingAttachmentInfo vk_color_att[HK_MAX_RTS]; + for (uint32_t i = 0; i < render->color_att_count; i++) { + if (render->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE) + need_resolve = true; + + vk_color_att[i] = (VkRenderingAttachmentInfo){ + .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO, + .imageView = hk_image_view_to_handle(render->color_att[i].iview), + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + .resolveMode = render->color_att[i].resolve_mode, + .resolveImageView = + hk_image_view_to_handle(render->color_att[i].resolve_iview), + .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + } + + const VkRenderingAttachmentInfo vk_depth_att = { + .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO, + .imageView = hk_image_view_to_handle(render->depth_att.iview), + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + .resolveMode = render->depth_att.resolve_mode, + .resolveImageView = + hk_image_view_to_handle(render->depth_att.resolve_iview), + .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + if (render->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE) + need_resolve = true; + + const VkRenderingAttachmentInfo vk_stencil_att = { + .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO, + .imageView = hk_image_view_to_handle(render->stencil_att.iview), + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + .resolveMode = render->stencil_att.resolve_mode, + .resolveImageView = + hk_image_view_to_handle(render->stencil_att.resolve_iview), + .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + if (render->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE) + need_resolve = true; + + const VkRenderingInfo vk_render = { + .sType = VK_STRUCTURE_TYPE_RENDERING_INFO, + .renderArea = render->area, + .layerCount = render->layer_count, + .viewMask = render->view_mask, + .colorAttachmentCount = render->color_att_count, + .pColorAttachments = vk_color_att, + .pDepthAttachment = &vk_depth_att, + .pStencilAttachment = &vk_stencil_att, + }; + + if (render->flags & VK_RENDERING_SUSPENDING_BIT) + need_resolve = false; + + memset(render, 0, sizeof(*render)); + + if (need_resolve) { + hk_meta_resolve_rendering(cmd, &vk_render); + } +} + +static uint64_t +hk_geometry_state(struct hk_cmd_buffer *cmd) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + /* We tie heap allocation to geometry state allocation, so allocate now. */ + if (unlikely(!dev->heap)) { + size_t size = 128 * 1024 * 1024; + dev->heap = agx_bo_create(&dev->dev, size, 0, "Geometry heap"); + + /* The geometry state buffer is initialized here and then is treated by + * the CPU as rodata, even though the GPU uses it for scratch internally. 
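+       *
+       * Only heap and heap_size are seeded from the CPU; heap_bottom is
+       * bumped by the GPU and reset per command buffer with hk_queue_write()
+       * below.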
+ */ + off_t off = dev->rodata.geometry_state - dev->rodata.bo->ptr.gpu; + struct agx_geometry_state *map = dev->rodata.bo->ptr.cpu + off; + + *map = (struct agx_geometry_state){ + .heap = dev->heap->ptr.gpu, + .heap_size = size, + }; + } + + /* We need to free all allocations after each command buffer execution */ + if (!cmd->uses_heap) { + uint64_t addr = dev->rodata.geometry_state; + + /* Zeroing the allocated index frees everything */ + hk_queue_write(cmd, + addr + offsetof(struct agx_geometry_state, heap_bottom), 0, + true /* after gfx */); + + cmd->uses_heap = true; + } + + return dev->rodata.geometry_state; +} + +static uint64_t +hk_upload_gsi_params(struct hk_cmd_buffer *cmd, struct hk_draw draw) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors; + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_shader *vs = hk_bound_sw_vs_before_gs(gfx); + + unsigned index_size_B = + draw.indexed ? agx_index_size_to_B(draw.index_size) : 0; + + uint64_t vb; + if (cmd->state.gfx.shaders[MESA_SHADER_TESS_EVAL]) { + assert(index_size_B == 4); + + vb = desc->root.draw.tess_params + + offsetof(struct libagx_tess_args, tes_buffer); + } else { + vb = desc->root.root_desc_addr + + offsetof(struct hk_root_descriptor_table, draw.vertex_output_buffer); + } + + struct agx_gs_setup_indirect_params gsi = { + .index_buffer = draw.index.addr, + .index_size_B = index_size_B, + .index_buffer_range_el = draw.index.range / index_size_B, + .zero_sink = dev->rodata.zero_sink, + .draw = draw.b.ptr, + .vertex_buffer = vb, + .ia = desc->root.draw.input_assembly, + .geom = desc->root.draw.geometry_params, + .vs_outputs = vs->b.info.outputs, + }; + + return hk_pool_upload(cmd, &gsi, sizeof(gsi), 8); +} + +static uint64_t +hk_upload_ia_params(struct hk_cmd_buffer *cmd, struct hk_draw draw) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + assert(!draw.b.indirect && "indirect params written by GPU"); + + struct agx_ia_state ia = {.verts_per_instance = draw.b.count[0]}; + + if (draw.indexed) { + unsigned index_size_B = agx_index_size_to_B(draw.index_size); + unsigned range_el = draw.index.range / index_size_B; + + ia.index_buffer = + libagx_index_buffer(draw.index.addr, range_el, draw.start, + index_size_B, dev->rodata.zero_sink); + + ia.index_buffer_range_el = + libagx_index_buffer_range_el(range_el, draw.start); + } + + return hk_pool_upload(cmd, &ia, sizeof(ia), 8); +} + +static enum mesa_prim +hk_gs_in_prim(struct hk_cmd_buffer *cmd) +{ + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_api_shader *tes = gfx->shaders[MESA_SHADER_TESS_EVAL]; + + if (tes != NULL) + return tes->variants[HK_GS_VARIANT_RAST].info.ts.out_prim; + else + return vk_conv_topology(dyn->ia.primitive_topology); +} + +static enum mesa_prim +hk_rast_prim(struct hk_cmd_buffer *cmd) +{ + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY]; + + if (gs != NULL) + return gs->variants[HK_GS_VARIANT_RAST].info.gs.out_prim; + else + return hk_gs_in_prim(cmd); +} + +static uint64_t +hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct hk_draw draw) +{ + struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors; + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY]; + struct hk_shader 
*fs = hk_only_variant(gfx->shaders[MESA_SHADER_FRAGMENT]); + + bool rast_disc = dyn->rs.rasterizer_discard_enable; + struct hk_shader *count = hk_count_gs_variant(gs, rast_disc); + + /* XXX: We should deduplicate this logic */ + bool restart = (draw.indexed && draw.restart); + bool indirect = + draw.b.indirect || gfx->shaders[MESA_SHADER_TESS_EVAL] || restart; + enum mesa_prim mode = hk_gs_in_prim(cmd); + + if (restart) { + mode = u_decomposed_prim(mode); + } + + struct agx_geometry_params params = { + .state = hk_geometry_state(cmd), + .indirect_desc = cmd->geom_indirect, + .flat_outputs = fs ? fs->info.fs.interp.flat : 0, + .input_topology = mode, + + /* Overriden by the indirect setup kernel. As tess->GS is always indirect, + * we can assume here that we're VS->GS. + */ + .input_buffer = desc->root.draw.vertex_output_buffer, + .input_mask = desc->root.draw.vertex_outputs, + }; + + if (gfx->xfb_enabled) { + for (unsigned i = 0; i < ARRAY_SIZE(gfx->xfb); ++i) { + params.xfb_base_original[i] = gfx->xfb[i].addr; + params.xfb_size[i] = gfx->xfb[i].range; + params.xfb_offs_ptrs[i] = gfx->xfb_offsets + i * sizeof(uint32_t); + } + } + + for (unsigned i = 0; i < ARRAY_SIZE(gfx->xfb_query); ++i) { + uint64_t q = gfx->xfb_query[i]; + + if (q) { + params.xfb_prims_generated_counter[i] = q; + params.prims_generated_counter[i] = q + sizeof(uint64_t); + } + } + + /* Calculate input primitive count for direct draws, and allocate the vertex + * & count buffers. GPU calculates and allocates for indirect draws. + */ + unsigned count_buffer_stride = count->info.gs.count_words * 4; + + if (indirect) { + params.count_buffer_stride = count_buffer_stride; + params.vs_grid[2] = params.gs_grid[2] = 1; + } else { + uint32_t verts = draw.b.count[0], instances = draw.b.count[1]; + + params.vs_grid[0] = verts; + params.gs_grid[0] = u_decomposed_prims_for_vertices(mode, verts); + + params.primitives_log2 = util_logbase2_ceil(params.gs_grid[0]); + params.input_primitives = params.gs_grid[0] * instances; + + unsigned size = params.input_primitives * count_buffer_stride; + if (size) { + params.count_buffer = hk_pool_alloc(cmd, size, 4).gpu; + } + } + + desc->root_dirty = true; + return hk_pool_upload(cmd, ¶ms, sizeof(params), 8); +} + +/* + * Tessellation has a fast path where the tessellator generates a VDM Index List + * command per patch, as well as a slow path using prefix sums to generate a + * single combined API draw. We need the latter if tessellation is fed into + * another software stage (geometry shading), or if we need accurate primitive + * IDs in the linked fragment shader (since that would require a prefix sum + * anyway). 
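+ *
+ * hk_tess_needs_prefix_sum() below checks exactly those two conditions; the
+ * with_counts path in hk_upload_tess_params() allocates count buffers and a
+ * single combined draw, while the fast path emits raw per-patch VDM commands
+ * that the main stream links to.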
+ */ +static bool +hk_tess_needs_prefix_sum(struct hk_cmd_buffer *cmd) +{ + struct hk_graphics_state *gfx = &cmd->state.gfx; + + return gfx->shaders[MESA_SHADER_GEOMETRY] || gfx->generate_primitive_id; +} + +static uint64_t +hk_upload_tess_params(struct hk_cmd_buffer *cmd, struct hk_draw draw) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_shader *tcs = hk_only_variant(gfx->shaders[MESA_SHADER_TESS_CTRL]); + struct hk_shader *tes = hk_any_variant(gfx->shaders[MESA_SHADER_TESS_EVAL]); + + struct libagx_tess_args args = { + .heap = hk_geometry_state(cmd), + .tcs_stride_el = tcs->info.tcs.output_stride / 4, + .statistic = hk_pipeline_stat_addr( + cmd, + VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT), + + .input_patch_size = dyn->ts.patch_control_points, + .output_patch_size = tcs->info.tcs.output_patch_size, + .tcs_patch_constants = tcs->info.tcs.nr_patch_outputs, + .tcs_per_vertex_outputs = tcs->info.tcs.per_vertex_outputs, + }; + + bool with_counts = hk_tess_needs_prefix_sum(cmd); + + /* This assumes !with_counts, if we have counts it's only one draw */ + uint32_t draw_stride_el = tes->info.ts.point_mode ? 4 : 6; + size_t draw_stride_B = draw_stride_el * sizeof(uint32_t); + + /* heap is allocated by hk_geometry_state */ + args.patch_coord_buffer = dev->heap->ptr.gpu; + + if (!draw.b.indirect) { + unsigned in_patches = draw.b.count[0] / args.input_patch_size; + if (in_patches == 0) + unreachable("todo: drop the draw?"); + + unsigned unrolled_patches = in_patches * draw.b.count[1]; + + uint32_t alloc = 0; + uint32_t tcs_out_offs = alloc; + alloc += unrolled_patches * args.tcs_stride_el * 4 * 32; + + uint32_t patch_coord_offs = alloc; + alloc += unrolled_patches * 4 * 32; + + uint32_t count_offs = alloc; + if (with_counts) + alloc += unrolled_patches * sizeof(uint32_t) * 32; + + uint32_t draw_offs = alloc; + + if (with_counts) { + /* Single API draw */ + alloc += 5 * sizeof(uint32_t); + } else { + /* Padding added because VDM overreads */ + alloc += (draw_stride_B * unrolled_patches) + + (AGX_VDM_BARRIER_LENGTH + 0x800); + } + + struct agx_ptr blob = hk_pool_alloc(cmd, alloc, 4); + args.tcs_buffer = blob.gpu + tcs_out_offs; + args.patches_per_instance = in_patches; + args.coord_allocs = blob.gpu + patch_coord_offs; + args.nr_patches = unrolled_patches; + args.out_draws = blob.gpu + draw_offs; + + gfx->tess_out_draws = args.out_draws; + + if (with_counts) { + args.counts = blob.gpu + count_offs; + } else { + /* Arrange so we return after all generated draws */ + uint8_t *ret = (uint8_t *)blob.cpu + draw_offs + + (draw_stride_B * unrolled_patches); + + agx_pack(ret, VDM_BARRIER, cfg) { + cfg.returns = true; + } + } + } else { + unreachable("todo: indirect with tess"); +#if 0 + args.tcs_statistic = agx_get_query_address( + batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS]); + + args.indirect = agx_indirect_buffer_ptr(batch, indirect); + + /* Allocate 3x indirect global+local grids for VS/TCS/tess */ + uint32_t grid_stride = sizeof(uint32_t) * 6; + args.grids = agx_pool_alloc_aligned(&batch->pool, grid_stride * 3, 4).gpu; + + vs_grid = agx_grid_indirect_local(args.grids + 0 * grid_stride); + tcs_grid = agx_grid_indirect_local(args.grids + 1 * grid_stride); + tess_grid = agx_grid_indirect_local(args.grids + 2 * grid_stride); + + args.vertex_outputs = ctx->vs->b.info.outputs; + args.vertex_output_buffer_ptr = + 
agx_pool_alloc_aligned(&batch->pool, 8, 8).gpu; + + batch->uniforms.vertex_output_buffer_ptr = args.vertex_output_buffer_ptr; + + if (with_counts) { + args.out_draws = agx_pool_alloc_aligned_with_bo( + &batch->pool, draw_stride, 4, &draw_bo) + .gpu; + } else { + unreachable("need an extra indirection..."); + } +#endif + } + + return hk_pool_upload(cmd, &args, sizeof(args), 8); +} + +static struct hk_api_shader * +hk_build_meta_shader_locked(struct hk_device *dev, struct hk_internal_key *key, + hk_internal_builder_t builder) +{ + /* Try to get the cached shader */ + struct hash_entry *ent = _mesa_hash_table_search(dev->kernels.ht, key); + if (ent) + return ent->data; + + nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, + &agx_nir_options, NULL); + builder(&b, key->key); + + const struct vk_pipeline_robustness_state rs = { + .images = VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_DISABLED_EXT, + .storage_buffers = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT, + .uniform_buffers = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT, + .vertex_inputs = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT, + }; + + struct vk_shader_compile_info info = { + .stage = b.shader->info.stage, + .nir = b.shader, + .robustness = &rs, + }; + + /* We need to link libagx and assign shared before preprocessing, matching + * what the driver would otherwise produce. + */ + agx_link_libagx(b.shader, dev->dev.libagx); + + if (info.stage == MESA_SHADER_COMPUTE) { + NIR_PASS(_, b.shader, nir_lower_vars_to_explicit_types, + nir_var_mem_shared, glsl_get_cl_type_size_align); + + /* Commit to the layout so we don't clobber later */ + b.shader->info.shared_memory_explicit_layout = true; + + NIR_PASS(_, b.shader, nir_lower_explicit_io, nir_var_mem_shared, + nir_address_format_62bit_generic); + } + + hk_preprocess_nir_internal(dev->vk.physical, b.shader); + + struct hk_api_shader *s; + if (hk_compile_shader(dev, &info, NULL, NULL, &s) != VK_SUCCESS) + return NULL; + + /* ..and cache it before we return. The key is on the stack right now, so + * clone it before using it as a hash table key. The clone is logically owned + * by the hash table. + */ + size_t total_key_size = sizeof(*key) + key->key_size; + void *cloned_key = ralloc_memdup(dev->kernels.ht, key, total_key_size); + + _mesa_hash_table_insert(dev->kernels.ht, cloned_key, s); + return s; +} + +struct hk_api_shader * +hk_meta_shader(struct hk_device *dev, hk_internal_builder_t builder, void *data, + size_t data_size) +{ + size_t total_key_size = sizeof(struct hk_internal_key) + data_size; + + struct hk_internal_key *key = alloca(total_key_size); + key->builder = builder; + key->key_size = data_size; + + if (data_size) + memcpy(key->key, data, data_size); + + simple_mtx_lock(&dev->kernels.lock); + struct hk_api_shader *s = hk_build_meta_shader_locked(dev, key, builder); + simple_mtx_unlock(&dev->kernels.lock); + + return s; +} + +static struct hk_draw +hk_draw_without_restart(struct hk_cmd_buffer *cmd, struct hk_cs *cs, + struct hk_draw draw, uint32_t draw_count) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + + perf_debug(dev, "Unrolling primitive restart due to GS/XFB"); + + /* The unroll kernel assumes an indirect draw. 
Synthesize one if needed */ + if (!draw.b.indirect) { + uint32_t desc[5] = {draw.b.count[0], draw.b.count[1], draw.start, + draw.index_bias, draw.start_instance}; + + draw = + hk_draw_indexed_indirect(hk_pool_upload(cmd, desc, sizeof(desc), 4), + draw.index, draw.index_size, true); + } + + /* Next, we unroll the index buffer used by the indirect draw */ + struct agx_unroll_restart_key key = { + .prim = vk_conv_topology(dyn->ia.primitive_topology), + .index_size_B = agx_index_size_to_B(draw.index_size), + }; + + struct agx_restart_unroll_params ia = { + .heap = hk_geometry_state(cmd), + .index_buffer = draw.index.addr, + .count = hk_pool_upload(cmd, &draw_count, sizeof(uint32_t), 4), + .draws = draw.b.ptr, + .out_draws = hk_pool_alloc(cmd, 5 * sizeof(uint32_t) * draw_count, 4).gpu, + .max_draws = 1 /* TODO: MDI */, + .restart_index = gfx->index.restart, + .index_buffer_size_el = draw.index.range / key.index_size_B, + .flatshade_first = + dyn->rs.provoking_vertex == VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT, + .zero_sink = dev->rodata.zero_sink, + }; + + struct hk_shader *s = + hk_meta_kernel(dev, agx_nir_unroll_restart, &key, sizeof(key)); + + uint64_t params = hk_pool_upload(cmd, &ia, sizeof(ia), 8); + uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &params, sizeof(params)); + hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(1024 * draw_count, 1, 1), + hk_grid(1024, 1, 1)); + + struct hk_addr_range out_index = { + .addr = dev->heap->ptr.gpu, + .range = dev->heap->size, + }; + + return hk_draw_indexed_indirect(ia.out_draws, out_index, draw.index_size, + false /* restart */); +} + +static struct hk_draw +hk_launch_gs_prerast(struct hk_cmd_buffer *cmd, struct hk_cs *cs, + struct hk_draw draw) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors; + struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY]; + struct hk_grid grid_vs, grid_gs; + + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + bool rast_disc = dyn->rs.rasterizer_discard_enable; + + hk_ensure_cs_has_space(cmd, cs, 0x2000 /*XXX*/); + + struct hk_shader *vs = hk_bound_sw_vs_before_gs(gfx); + struct hk_shader *main = hk_main_gs_variant(gs, rast_disc); + struct hk_shader *count = hk_count_gs_variant(gs, rast_disc); + struct hk_shader *pre_gs = hk_pre_gs_variant(gs, rast_disc); + + unsigned count_words = count->info.gs.count_words; + + if (false /* TODO */) + perf_debug(dev, "Transform feedback"); + else if (count_words) + perf_debug(dev, "Geometry shader with counts"); + else + perf_debug(dev, "Geometry shader without counts"); + + enum mesa_prim mode = hk_gs_in_prim(cmd); + + if (draw.indexed && draw.restart) { + draw = hk_draw_without_restart(cmd, cs, draw, 1); + mode = u_decomposed_prim(mode); + } + + /* Setup grids */ + if (draw.b.indirect) { + struct agx_gs_setup_indirect_key key = {.prim = mode}; + + struct hk_shader *gsi = + hk_meta_kernel(dev, agx_nir_gs_setup_indirect, &key, sizeof(key)); + + uint64_t push = hk_upload_gsi_params(cmd, draw); + uint32_t usc = hk_upload_usc_words_kernel(cmd, gsi, &push, sizeof(push)); + + hk_dispatch_with_usc(dev, cs, gsi, usc, hk_grid(1, 1, 1), + hk_grid(1, 1, 1)); + + uint64_t geometry_params = desc->root.draw.geometry_params; + grid_vs = hk_grid_indirect(geometry_params + + offsetof(struct agx_geometry_params, vs_grid)); + + grid_gs = hk_grid_indirect(geometry_params + + offsetof(struct agx_geometry_params, gs_grid)); + } else { + grid_vs = grid_gs = 
draw.b; + grid_gs.count[0] = u_decomposed_prims_for_vertices(mode, draw.b.count[0]); + } + + /* Launch the vertex shader first */ + hk_reserve_scratch(cmd, cs, vs); + hk_dispatch_with_usc(dev, cs, vs, + hk_upload_usc_words(cmd, vs, + vs->info.stage == MESA_SHADER_VERTEX + ? gfx->linked[MESA_SHADER_VERTEX] + : vs->only_linked), + grid_vs, hk_grid(1, 1, 1)); + + /* If we need counts, launch the count shader and prefix sum the results. */ + if (count_words) { + hk_dispatch_with_local_size(cmd, cs, count, grid_gs, hk_grid(1, 1, 1)); + + struct hk_api_shader *prefix_sum = hk_meta_shader( + dev, agx_nir_prefix_sum_gs, &count_words, sizeof(count_words)); + + /* XXX: hack */ + hk_only_variant(prefix_sum)->info.stage = MESA_SHADER_GEOMETRY; + + hk_dispatch_with_local_size(cmd, cs, hk_only_variant(prefix_sum), + hk_grid(1024 * count_words, 1, 1), + hk_grid(1024, 1, 1)); + } + + /* Pre-GS shader */ + hk_dispatch_with_local_size(cmd, cs, pre_gs, hk_grid(1, 1, 1), + hk_grid(1, 1, 1)); + + /* Pre-rast geometry shader */ + hk_dispatch_with_local_size(cmd, cs, main, grid_gs, hk_grid(1, 1, 1)); + + struct hk_addr_range range = (struct hk_addr_range){ + .addr = dev->heap->ptr.gpu, + .range = dev->heap->size, + }; + + bool restart = cmd->state.gfx.topology != AGX_PRIMITIVE_POINTS; + return hk_draw_indexed_indirect(cmd->geom_indirect, range, + AGX_INDEX_SIZE_U32, restart); +} + +static struct hk_draw +hk_launch_tess(struct hk_cmd_buffer *cmd, struct hk_cs *cs, struct hk_draw draw) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_grid grid_vs, grid_tcs, grid_tess; + + struct hk_shader *vs = hk_bound_sw_vs(gfx); + struct hk_shader *tcs = hk_only_variant(gfx->shaders[MESA_SHADER_TESS_CTRL]); + struct hk_shader *tes = hk_any_variant(gfx->shaders[MESA_SHADER_TESS_EVAL]); + + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + uint32_t input_patch_size = dyn->ts.patch_control_points; + + hk_ensure_cs_has_space(cmd, cs, 0x2000 /*XXX*/); + + perf_debug(dev, "Tessellation"); + + uint64_t tcs_stat = hk_pipeline_stat_addr( + cmd, VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT); + + /* Setup grids */ + if (draw.b.indirect) { + unreachable("todo: indirect tess"); +#if 0 + struct agx_gs_setup_indirect_key key = {.prim = mode}; + + struct hk_shader *gsi = + hk_meta_kernel(dev, agx_nir_gs_setup_indirect, &key, sizeof(key)); + + uint64_t push = hk_upload_gsi_params(cmd, draw); + uint32_t usc = hk_upload_usc_words_kernel(cmd, gsi, &push, sizeof(push)); + + hk_dispatch_with_usc(dev, cs, gsi, usc, hk_grid(1, 1, 1), + hk_grid(1, 1, 1)); + + uint64_t geometry_params = desc->root.draw.geometry_params; + grid_vs = hk_grid_indirect(geometry_params + + offsetof(struct agx_geometry_params, vs_grid)); + + grid_gs = hk_grid_indirect(geometry_params + + offsetof(struct agx_geometry_params, gs_grid)); +#endif + } else { + uint32_t patches = draw.b.count[0] / input_patch_size; + grid_vs = grid_tcs = draw.b; + + grid_tcs.count[0] = patches * tcs->info.tcs.output_patch_size; + grid_tess = hk_grid(patches * draw.b.count[1], 1, 1); + + /* TCS invocation counter increments once per-patch */ + if (tcs_stat) { + perf_debug(dev, "Direct TCS statistic"); + + struct libagx_increment_params args = { + .statistic = tcs_stat, + .delta = patches, + }; + + struct hk_shader *s = + hk_meta_kernel(dev, agx_nir_increment_statistic, NULL, 0); + + uint64_t push = hk_pool_upload(cmd, &args, sizeof(args), 8); + uint32_t usc = 
hk_upload_usc_words_kernel(cmd, s, &push, sizeof(push)); + + hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(1, 1, 1), + hk_grid(1, 1, 1)); + } + } + + /* First launch the VS and TCS */ + hk_reserve_scratch(cmd, cs, vs); + hk_reserve_scratch(cmd, cs, tcs); + + /* XXX perf: grid size */ + hk_dispatch_with_usc( + dev, cs, vs, + hk_upload_usc_words(cmd, vs, gfx->linked[MESA_SHADER_VERTEX]), grid_vs, + hk_grid(64, 1, 1)); + + hk_dispatch_with_usc( + dev, cs, tcs, hk_upload_usc_words(cmd, tcs, tcs->only_linked), grid_tcs, + hk_grid(tcs->info.tcs.output_patch_size, 1, 1)); + + /* TODO indirect */ + + bool with_counts = hk_tess_needs_prefix_sum(cmd); + uint64_t state = gfx->descriptors.root.draw.tess_params; + + /* If the domain is flipped, we need to flip the winding order */ + bool ccw = tes->info.ts.ccw; + ccw ^= dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT; + + enum libagx_tess_partitioning partitioning = + tes->info.ts.spacing == TESS_SPACING_EQUAL + ? LIBAGX_TESS_PARTITIONING_INTEGER + : tes->info.ts.spacing == TESS_SPACING_FRACTIONAL_ODD + ? LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD + : LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN; + + enum libagx_tess_output_primitive prim = + tes->info.ts.point_mode ? LIBAGX_TESS_OUTPUT_POINT + : ccw ? LIBAGX_TESS_OUTPUT_TRIANGLE_CCW + : LIBAGX_TESS_OUTPUT_TRIANGLE_CW; + + struct agx_tessellator_key key = { + .prim = tes->info.ts.mode, + .output_primitive = prim, + .partitioning = partitioning, + }; + + if (with_counts) { + perf_debug(dev, "Tessellation with counts"); + + /* Generate counts */ + key.mode = LIBAGX_TESS_MODE_COUNT; + { + struct hk_shader *tess = + hk_meta_kernel(dev, agx_nir_tessellate, &key, sizeof(key)); + + hk_dispatch_with_usc( + dev, cs, tess, + hk_upload_usc_words_kernel(cmd, tess, &state, sizeof(state)), + grid_tess, hk_grid(64, 1, 1)); + } + + /* Prefix sum counts, allocating index buffer space. 
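+ * A single 1024-wide workgroup below computes the sum, so the + * LIBAGX_TESS_MODE_WITH_COUNTS pass that follows knows where each patch + * writes its indices.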
*/ + { + struct hk_shader *sum = + hk_meta_kernel(dev, agx_nir_prefix_sum_tess, NULL, 0); + + hk_dispatch_with_usc( + dev, cs, sum, + hk_upload_usc_words_kernel(cmd, sum, &state, sizeof(state)), + hk_grid(1024, 1, 1), hk_grid(1024, 1, 1)); + } + + key.mode = LIBAGX_TESS_MODE_WITH_COUNTS; + } else { + key.mode = LIBAGX_TESS_MODE_VDM; + } + + /* Now we can tessellate */ + { + struct hk_shader *tess = + hk_meta_kernel(dev, agx_nir_tessellate, &key, sizeof(key)); + + hk_dispatch_with_usc( + dev, cs, tess, + hk_upload_usc_words_kernel(cmd, tess, &state, sizeof(state)), + grid_tess, hk_grid(64, 1, 1)); + } + + struct hk_addr_range range = (struct hk_addr_range){ + .addr = dev->heap->ptr.gpu, + .range = dev->heap->size, + }; + + struct hk_draw out = hk_draw_indexed_indirect(gfx->tess_out_draws, range, + AGX_INDEX_SIZE_U32, false); + out.raw = !with_counts; + return out; +} + +void +hk_cmd_bind_graphics_shader(struct hk_cmd_buffer *cmd, + const gl_shader_stage stage, + struct hk_api_shader *shader) +{ + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + + assert(stage < ARRAY_SIZE(cmd->state.gfx.shaders)); + if (cmd->state.gfx.shaders[stage] == shader) + return; + + cmd->state.gfx.shaders[stage] = shader; + cmd->state.gfx.shaders_dirty |= BITFIELD_BIT(stage); + + if (stage == MESA_SHADER_FRAGMENT) { + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES); + } +} + +static uint32_t +hk_pipeline_bind_group(gl_shader_stage stage) +{ + return stage; +} + +static void +hk_flush_shaders(struct hk_cmd_buffer *cmd) +{ + if (cmd->state.gfx.shaders_dirty == 0) + return; + + /* Map shader types to shaders */ + struct hk_api_shader *type_shader[6] = { + NULL, + }; + uint32_t types_dirty = 0; + + const uint32_t gfx_stages = + BITFIELD_BIT(MESA_SHADER_VERTEX) | BITFIELD_BIT(MESA_SHADER_TESS_CTRL) | + BITFIELD_BIT(MESA_SHADER_TESS_EVAL) | BITFIELD_BIT(MESA_SHADER_GEOMETRY) | + BITFIELD_BIT(MESA_SHADER_FRAGMENT); + + /* Geometry shading overrides the restart index, reemit on rebind */ + if (IS_SHADER_DIRTY(GEOMETRY)) { + cmd->state.gfx.dirty |= HK_DIRTY_INDEX; + } + + u_foreach_bit(stage, cmd->state.gfx.shaders_dirty & gfx_stages) { + /* TODO: compact? */ + uint32_t type = stage; + types_dirty |= BITFIELD_BIT(type); + + /* Only copy non-NULL shaders because mesh/task alias with vertex and + * tessellation stages. + */ + if (cmd->state.gfx.shaders[stage] != NULL) { + assert(type < ARRAY_SIZE(type_shader)); + assert(type_shader[type] == NULL); + type_shader[type] = cmd->state.gfx.shaders[stage]; + } + } + + u_foreach_bit(type, types_dirty) { + struct hk_api_shader *shader = type_shader[type]; + + /* We always map index == type */ + // const uint32_t idx = type; + + if (shader == NULL) + continue; + + /* TODO */ + } + + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_shader *hw_vs = hk_bound_hw_vs(gfx); + struct hk_api_shader *fs = gfx->shaders[MESA_SHADER_FRAGMENT]; + + /* If we have a new VS/FS pair, UVS locations may have changed so need to + * relink. We do this here because there's no dependence on the fast linked + * shaders. + */ + agx_assign_uvs(&gfx->linked_varyings, &hw_vs->info.uvs, + fs ? hk_only_variant(fs)->info.fs.interp.flat : 0, + fs ? 
hk_only_variant(fs)->info.fs.interp.linear : 0); + + struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors; + desc->root_dirty = true; + + for (unsigned i = 0; i < VARYING_SLOT_MAX; ++i) { + desc->root.draw.uvs_index[i] = gfx->linked_varyings.slots[i]; + } +} + +static struct agx_shader_part * +hk_get_prolog_epilog_locked(struct hk_device *dev, struct hk_internal_key *key, + hk_internal_builder_t builder, bool preprocess_nir, + bool stop, unsigned cf_base) +{ + /* Try to get the cached shader */ + struct hash_entry *ent = _mesa_hash_table_search(dev->prolog_epilog.ht, key); + if (ent) + return ent->data; + + nir_builder b = nir_builder_init_simple_shader(0, &agx_nir_options, NULL); + builder(&b, key->key); + + if (preprocess_nir) + agx_preprocess_nir(b.shader, dev->dev.libagx); + + struct agx_shader_key backend_key = { + .needs_g13x_coherency = (dev->dev.params.gpu_generation == 13 && + dev->dev.params.num_clusters_total > 1) || + dev->dev.params.num_dies > 1, + .libagx = dev->dev.libagx, + .secondary = true, + .no_stop = !stop, + }; + + /* We always use dynamic sample shading in the GL driver. Indicate that. */ + if (b.shader->info.stage == MESA_SHADER_FRAGMENT) { + backend_key.fs.cf_base = cf_base; + + if (b.shader->info.fs.uses_sample_shading) + backend_key.fs.inside_sample_loop = true; + } + + struct agx_shader_part *part = + rzalloc(dev->prolog_epilog.ht, struct agx_shader_part); + + agx_compile_shader_nir(b.shader, &backend_key, NULL, part); + + ralloc_free(b.shader); + + /* ..and cache it before we return. The key is on the stack right now, so + * clone it before using it as a hash table key. The clone is logically owned + * by the hash table. + */ + size_t total_key_size = sizeof(*key) + key->key_size; + void *cloned_key = ralloc_memdup(dev->prolog_epilog.ht, key, total_key_size); + + _mesa_hash_table_insert(dev->prolog_epilog.ht, cloned_key, part); + return part; +} + +static struct agx_shader_part * +hk_get_prolog_epilog(struct hk_device *dev, void *data, size_t data_size, + hk_internal_builder_t builder, bool preprocess_nir, + bool stop, unsigned cf_base) +{ + /* Build the meta shader key */ + size_t total_key_size = sizeof(struct hk_internal_key) + data_size; + + struct hk_internal_key *key = alloca(total_key_size); + key->builder = builder; + key->key_size = data_size; + + if (data_size) + memcpy(key->key, data, data_size); + + simple_mtx_lock(&dev->prolog_epilog.lock); + + struct agx_shader_part *part = hk_get_prolog_epilog_locked( + dev, key, builder, preprocess_nir, stop, cf_base); + + simple_mtx_unlock(&dev->prolog_epilog.lock); + return part; +} + +static struct hk_linked_shader * +hk_get_fast_linked_locked_vs(struct hk_device *dev, struct hk_shader *shader, + struct hk_fast_link_key_vs *key) +{ + struct agx_shader_part *prolog = + hk_get_prolog_epilog(dev, &key->prolog, sizeof(key->prolog), + agx_nir_vs_prolog, false, false, 0); + + struct hk_linked_shader *linked = + hk_fast_link(dev, false, shader, prolog, NULL, 0); + + struct hk_fast_link_key *key_clone = + ralloc_memdup(shader->linked.ht, key, sizeof(*key)); + + /* XXX: Fix this higher up the stack */ + linked->b.uses_base_param |= !key->prolog.hw; + + _mesa_hash_table_insert(shader->linked.ht, key_clone, linked); + return linked; +} + +static void +build_fs_prolog(nir_builder *b, const void *key) +{ + agx_nir_fs_prolog(b, key); + + /* Lower load_stat_query_address_agx, needed for FS statistics */ + NIR_PASS(_, b->shader, hk_lower_uvs_index, 0); +} + +static struct hk_linked_shader * 
+hk_get_fast_linked_locked_fs(struct hk_device *dev, struct hk_shader *shader, + struct hk_fast_link_key_fs *key) +{ + /* TODO: prolog without fs needs to work too... */ + bool needs_prolog = key->prolog.statistics || + key->prolog.cull_distance_size || + key->prolog.api_sample_mask != 0xff; + + struct agx_shader_part *prolog = NULL; + if (needs_prolog) { + prolog = hk_get_prolog_epilog(dev, &key->prolog, sizeof(key->prolog), + build_fs_prolog, false, false, + key->prolog.cf_base); + } + + /* If sample shading is used, don't stop at the epilog, there's a + * footer that the fast linker will insert to stop. + */ + bool epilog_stop = (key->nr_samples_shaded == 0); + + struct agx_shader_part *epilog = + hk_get_prolog_epilog(dev, &key->epilog, sizeof(key->epilog), + agx_nir_fs_epilog, true, epilog_stop, 0); + + struct hk_linked_shader *linked = + hk_fast_link(dev, true, shader, prolog, epilog, key->nr_samples_shaded); + + struct hk_fast_link_key *key_clone = + ralloc_memdup(shader->linked.ht, key, sizeof(*key)); + + _mesa_hash_table_insert(shader->linked.ht, key_clone, linked); + return linked; +} + +/* + * First, look for a fully linked variant. Else, build the required shader + * parts and link. + */ +static struct hk_linked_shader * +hk_get_fast_linked(struct hk_device *dev, struct hk_shader *shader, void *key) +{ + struct hk_linked_shader *linked; + simple_mtx_lock(&shader->linked.lock); + + struct hash_entry *ent = _mesa_hash_table_search(shader->linked.ht, key); + + if (ent) + linked = ent->data; + else if (shader->info.stage == MESA_SHADER_VERTEX) + linked = hk_get_fast_linked_locked_vs(dev, shader, key); + else if (shader->info.stage == MESA_SHADER_FRAGMENT) + linked = hk_get_fast_linked_locked_fs(dev, shader, key); + else + unreachable("invalid stage"); + + simple_mtx_unlock(&shader->linked.lock); + return linked; +} + +static void +hk_update_fast_linked(struct hk_cmd_buffer *cmd, struct hk_shader *shader, + void *key) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct hk_linked_shader *new = hk_get_fast_linked(dev, shader, key); + gl_shader_stage stage = shader->info.stage; + + if (cmd->state.gfx.linked[stage] != new) { + cmd->state.gfx.linked[stage] = new; + cmd->state.gfx.linked_dirty |= BITFIELD_BIT(stage); + } +} + +static enum agx_polygon_mode +translate_polygon_mode(VkPolygonMode vk_mode) +{ + static_assert((enum agx_polygon_mode)VK_POLYGON_MODE_FILL == + AGX_POLYGON_MODE_FILL); + static_assert((enum agx_polygon_mode)VK_POLYGON_MODE_LINE == + AGX_POLYGON_MODE_LINE); + static_assert((enum agx_polygon_mode)VK_POLYGON_MODE_POINT == + AGX_POLYGON_MODE_POINT); + + assert(vk_mode <= VK_POLYGON_MODE_POINT); + return (enum agx_polygon_mode)vk_mode; +} + +static enum agx_zs_func +translate_compare_op(VkCompareOp vk_mode) +{ + static_assert((enum agx_zs_func)VK_COMPARE_OP_NEVER == AGX_ZS_FUNC_NEVER); + static_assert((enum agx_zs_func)VK_COMPARE_OP_LESS == AGX_ZS_FUNC_LESS); + static_assert((enum agx_zs_func)VK_COMPARE_OP_EQUAL == AGX_ZS_FUNC_EQUAL); + static_assert((enum agx_zs_func)VK_COMPARE_OP_LESS_OR_EQUAL == + AGX_ZS_FUNC_LEQUAL); + static_assert((enum agx_zs_func)VK_COMPARE_OP_GREATER == + AGX_ZS_FUNC_GREATER); + static_assert((enum agx_zs_func)VK_COMPARE_OP_NOT_EQUAL == + AGX_ZS_FUNC_NOT_EQUAL); + static_assert((enum agx_zs_func)VK_COMPARE_OP_GREATER_OR_EQUAL == + AGX_ZS_FUNC_GEQUAL); + static_assert((enum agx_zs_func)VK_COMPARE_OP_ALWAYS == AGX_ZS_FUNC_ALWAYS); + + assert(vk_mode <= VK_COMPARE_OP_ALWAYS); + return (enum agx_zs_func)vk_mode; +} + +static enum 
agx_stencil_op +translate_stencil_op(VkStencilOp vk_op) +{ + static_assert((enum agx_stencil_op)VK_STENCIL_OP_KEEP == + AGX_STENCIL_OP_KEEP); + static_assert((enum agx_stencil_op)VK_STENCIL_OP_ZERO == + AGX_STENCIL_OP_ZERO); + static_assert((enum agx_stencil_op)VK_STENCIL_OP_REPLACE == + AGX_STENCIL_OP_REPLACE); + static_assert((enum agx_stencil_op)VK_STENCIL_OP_INCREMENT_AND_CLAMP == + AGX_STENCIL_OP_INCR_SAT); + static_assert((enum agx_stencil_op)VK_STENCIL_OP_DECREMENT_AND_CLAMP == + AGX_STENCIL_OP_DECR_SAT); + static_assert((enum agx_stencil_op)VK_STENCIL_OP_INVERT == + AGX_STENCIL_OP_INVERT); + static_assert((enum agx_stencil_op)VK_STENCIL_OP_INCREMENT_AND_WRAP == + AGX_STENCIL_OP_INCR_WRAP); + static_assert((enum agx_stencil_op)VK_STENCIL_OP_DECREMENT_AND_WRAP == + AGX_STENCIL_OP_DECR_WRAP); + + return (enum agx_stencil_op)vk_op; +} + +static void +hk_ppp_push_stencil_face(struct agx_ppp_update *ppp, + struct vk_stencil_test_face_state s, bool enabled) +{ + if (enabled) { + agx_ppp_push(ppp, FRAGMENT_STENCIL, cfg) { + cfg.compare = translate_compare_op(s.op.compare); + cfg.write_mask = s.write_mask; + cfg.read_mask = s.compare_mask; + + cfg.depth_pass = translate_stencil_op(s.op.pass); + cfg.depth_fail = translate_stencil_op(s.op.depth_fail); + cfg.stencil_fail = translate_stencil_op(s.op.fail); + } + } else { + agx_ppp_push(ppp, FRAGMENT_STENCIL, cfg) { + cfg.compare = AGX_ZS_FUNC_ALWAYS; + cfg.write_mask = 0xFF; + cfg.read_mask = 0xFF; + + cfg.depth_pass = AGX_STENCIL_OP_KEEP; + cfg.depth_fail = AGX_STENCIL_OP_KEEP; + cfg.stencil_fail = AGX_STENCIL_OP_KEEP; + } + } +} + +static bool +hk_stencil_test_enabled(struct hk_cmd_buffer *cmd) +{ + const struct hk_rendering_state *render = &cmd->state.gfx.render; + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + + return dyn->ds.stencil.test_enable && + render->stencil_att.vk_format != VK_FORMAT_UNDEFINED; +} + +static void +hk_flush_vp_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, uint8_t **out) +{ + const struct vk_dynamic_graphics_state *dyn = + &cmd->vk.dynamic_graphics_state; + + /* We always need at least 1 viewport for the hardware. With rasterizer + * discard the app may not supply any, but we can just program garbage. + */ + unsigned count = MAX2(dyn->vp.viewport_count, 1); + + unsigned minx[HK_MAX_VIEWPORTS] = {0}, miny[HK_MAX_VIEWPORTS] = {0}; + unsigned maxx[HK_MAX_VIEWPORTS] = {0}, maxy[HK_MAX_VIEWPORTS] = {0}; + + /* We implicitly scissor to the viewport. We need to do a min/max dance to + * handle inverted viewports. + */ + for (uint32_t i = 0; i < dyn->vp.viewport_count; i++) { + const VkViewport *vp = &dyn->vp.viewports[i]; + + minx[i] = MIN2(vp->x, vp->x + vp->width); + miny[i] = MIN2(vp->y, vp->y + vp->height); + maxx[i] = MAX2(vp->x, vp->x + vp->width); + maxy[i] = MAX2(vp->y, vp->y + vp->height); + } + + /* Additionally clamp to the framebuffer so we don't rasterize + * off-screen pixels. TODO: Is this necessary? the GL driver does this but + * it might be cargoculted at this point. + * + * Rasterizing off-screen is software-visible and can cause faults with + * eMRT when the framebuffer is not a multiple of the tile size. 
+ */ + for (unsigned i = 0; i < count; ++i) { + minx[i] = MIN2(minx[i], cmd->state.gfx.render.cr.width); + maxx[i] = MIN2(maxx[i], cmd->state.gfx.render.cr.width); + miny[i] = MIN2(miny[i], cmd->state.gfx.render.cr.height); + maxy[i] = MIN2(maxy[i], cmd->state.gfx.render.cr.height); + } + + /* We additionally apply any API scissors */ + for (unsigned i = 0; i < dyn->vp.scissor_count; ++i) { + const VkRect2D *s = &dyn->vp.scissors[i]; + + minx[i] = MAX2(minx[i], s->offset.x); + miny[i] = MAX2(miny[i], s->offset.y); + maxx[i] = MIN2(maxx[i], s->offset.x + s->extent.width); + maxy[i] = MIN2(maxy[i], s->offset.y + s->extent.height); + } + + /* Upload a hardware scissor for each viewport, whether there's a + * corresponding API scissor or not. + */ + unsigned index = cs->scissor.size / AGX_SCISSOR_LENGTH; + struct agx_scissor_packed *scissors = + util_dynarray_grow_bytes(&cs->scissor, count, AGX_SCISSOR_LENGTH); + + for (unsigned i = 0; i < count; ++i) { + const VkViewport *vp = &dyn->vp.viewports[i]; + + agx_pack(scissors + i, SCISSOR, cfg) { + cfg.min_x = minx[i]; + cfg.min_y = miny[i]; + cfg.max_x = maxx[i]; + cfg.max_y = maxy[i]; + + /* These settings in conjunction with the PPP control depth clip/clamp + * settings implement depth clip/clamping. Properly setting them + * together is required for conformant depth clip enable. + * + * TODO: Reverse-engineer the finer interactions here. + */ + if (dyn->rs.depth_clamp_enable) { + cfg.min_z = MIN2(vp->minDepth, vp->maxDepth); + cfg.max_z = MAX2(vp->minDepth, vp->maxDepth); + } else { + cfg.min_z = 0.0; + cfg.max_z = 1.0; + } + } + } + + /* Upload state */ + struct AGX_PPP_HEADER present = { + .depth_bias_scissor = true, + .region_clip = true, + .viewport = true, + .viewport_count = count, + }; + + size_t size = agx_ppp_update_size(&present); + struct agx_ptr T = hk_pool_alloc(cmd, size, 64); + if (!T.cpu) + return; + + struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &present); + + agx_ppp_push(&ppp, DEPTH_BIAS_SCISSOR, cfg) { + cfg.scissor = index; + + /* Use the current depth bias, we allocate linearly */ + unsigned count = cs->depth_bias.size / AGX_DEPTH_BIAS_LENGTH; + cfg.depth_bias = count ? 
count - 1 : 0; + }; + + for (unsigned i = 0; i < count; ++i) { + agx_ppp_push(&ppp, REGION_CLIP, cfg) { + cfg.enable = true; + cfg.min_x = minx[i] / 32; + cfg.min_y = miny[i] / 32; + cfg.max_x = DIV_ROUND_UP(MAX2(maxx[i], 1), 32); + cfg.max_y = DIV_ROUND_UP(MAX2(maxy[i], 1), 32); + } + } + + agx_ppp_push(&ppp, VIEWPORT_CONTROL, cfg) + ; + + /* Upload viewports */ + for (unsigned i = 0; i < count; ++i) { + const VkViewport *vp = &dyn->vp.viewports[i]; + + agx_ppp_push(&ppp, VIEWPORT, cfg) { + cfg.translate_x = vp->x + 0.5f * vp->width; + cfg.translate_y = vp->y + 0.5f * vp->height; + cfg.translate_z = vp->minDepth; + + cfg.scale_x = vp->width * 0.5f; + cfg.scale_y = vp->height * 0.5f; + cfg.scale_z = vp->maxDepth - vp->minDepth; + } + } + + agx_ppp_fini(out, &ppp); +} + +static enum agx_object_type +translate_object_type(enum mesa_prim topology) +{ + static_assert(MESA_PRIM_LINES < MESA_PRIM_LINE_STRIP); + static_assert(MESA_PRIM_TRIANGLES >= MESA_PRIM_LINE_STRIP); + + if (topology == MESA_PRIM_POINTS) + return AGX_OBJECT_TYPE_POINT_SPRITE_UV01; + else if (topology <= MESA_PRIM_LINE_STRIP) + return AGX_OBJECT_TYPE_LINE; + else + return AGX_OBJECT_TYPE_TRIANGLE; +} + +static enum agx_primitive +translate_hw_primitive_topology(enum mesa_prim prim) +{ + switch (prim) { + case MESA_PRIM_POINTS: + return AGX_PRIMITIVE_POINTS; + case MESA_PRIM_LINES: + return AGX_PRIMITIVE_LINES; + case MESA_PRIM_LINE_STRIP: + return AGX_PRIMITIVE_LINE_STRIP; + case MESA_PRIM_TRIANGLES: + return AGX_PRIMITIVE_TRIANGLES; + case MESA_PRIM_TRIANGLE_STRIP: + return AGX_PRIMITIVE_TRIANGLE_STRIP; + case MESA_PRIM_TRIANGLE_FAN: + return AGX_PRIMITIVE_TRIANGLE_FAN; + default: + unreachable("Invalid hardware primitive topology"); + } +} + +static inline enum agx_vdm_vertex +translate_vdm_vertex(unsigned vtx) +{ + static_assert(AGX_VDM_VERTEX_0 == 0); + static_assert(AGX_VDM_VERTEX_1 == 1); + static_assert(AGX_VDM_VERTEX_2 == 2); + + assert(vtx <= 2); + return vtx; +} + +static inline enum agx_ppp_vertex +translate_ppp_vertex(unsigned vtx) +{ + static_assert(AGX_PPP_VERTEX_0 == 0 + 1); + static_assert(AGX_PPP_VERTEX_1 == 1 + 1); + static_assert(AGX_PPP_VERTEX_2 == 2 + 1); + + assert(vtx <= 2); + return vtx + 1; +} + +static void +hk_flush_index(struct hk_cmd_buffer *cmd, struct hk_cs *cs) +{ + uint8_t *out = cs->current; + agx_push(out, VDM_STATE, cfg) { + cfg.restart_index_present = true; + } + + agx_push(out, VDM_STATE_RESTART_INDEX, cfg) { + if (cmd->state.gfx.shaders[MESA_SHADER_GEOMETRY]) + cfg.value = BITFIELD_MASK(32); + else + cfg.value = cmd->state.gfx.index.restart; + } + + cs->current = out; +} + +/* + * Return the given sample positions, packed into a 32-bit word with fixed + * point nibbles for each x/y component of the (at most 4) samples. This is + * suitable for programming the PPP_MULTISAMPLECTL control register. + */ +static uint32_t +hk_pack_ppp_multisamplectrl(const struct vk_sample_locations_state *sl) +{ + uint32_t ctrl = 0; + + for (int32_t i = sl->per_pixel - 1; i >= 0; i--) { + VkSampleLocationEXT loc = sl->locations[i]; + + uint32_t x = CLAMP(loc.x, 0.0f, 0.9375f) * 16.0; + uint32_t y = CLAMP(loc.y, 0.0f, 0.9375f) * 16.0; + + assert(x <= 15); + assert(y <= 15); + + /* Push bytes in reverse order so we can use constant shifts. */ + ctrl = (ctrl << 8) | (y << 4) | x; + } + + return ctrl; +} + +/* + * Return the standard sample positions, prepacked as above for efficiency. 
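+ * + * For example, with 4 samples the standard positions (0.375, 0.125), + * (0.875, 0.375), (0.125, 0.625) and (0.625, 0.875) scale to the nibble + * pairs 0x26, 0x6e, 0xa2 and 0xea, which pack (sample 0 in the low byte) + * into the 0xeaa26e26 word returned below.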
+ */ +uint32_t +hk_default_sample_positions(unsigned nr_samples) +{ + switch (nr_samples) { + case 0: + case 1: + return 0x88; + case 2: + return 0x44cc; + case 4: + return 0xeaa26e26; + default: + unreachable("Invalid sample count"); + } +} + +static void +hk_flush_ppp_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, uint8_t **out) +{ + const struct hk_rendering_state *render = &cmd->state.gfx.render; + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_shader *hw_vs = hk_bound_hw_vs(gfx); + struct hk_shader *fs = hk_only_variant(gfx->shaders[MESA_SHADER_FRAGMENT]); + + bool hw_vs_dirty = IS_SHADER_DIRTY(VERTEX) || IS_SHADER_DIRTY(TESS_EVAL) || + IS_SHADER_DIRTY(GEOMETRY); + bool fs_dirty = IS_SHADER_DIRTY(FRAGMENT); + + struct hk_linked_shader *linked_fs = gfx->linked[MESA_SHADER_FRAGMENT]; + bool linked_fs_dirty = IS_LINKED_DIRTY(FRAGMENT); + + bool varyings_dirty = gfx->dirty & HK_DIRTY_VARYINGS; + + bool face_dirty = + IS_DIRTY(DS_DEPTH_TEST_ENABLE) || IS_DIRTY(DS_DEPTH_WRITE_ENABLE) || + IS_DIRTY(DS_DEPTH_COMPARE_OP) || IS_DIRTY(DS_STENCIL_REFERENCE) || + IS_DIRTY(RS_LINE_WIDTH) || IS_DIRTY(RS_POLYGON_MODE) || fs_dirty; + + bool stencil_face_dirty = + IS_DIRTY(DS_STENCIL_OP) || IS_DIRTY(DS_STENCIL_COMPARE_MASK) || + IS_DIRTY(DS_STENCIL_WRITE_MASK) || IS_DIRTY(DS_STENCIL_TEST_ENABLE); + + struct AGX_PPP_HEADER dirty = { + .fragment_control = + IS_DIRTY(DS_STENCIL_TEST_ENABLE) || IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) || + IS_DIRTY(RS_DEPTH_BIAS_ENABLE) || gfx->dirty & HK_DIRTY_OCCLUSION, + + .fragment_control_2 = + IS_DIRTY(RS_RASTERIZER_DISCARD_ENABLE) || linked_fs_dirty, + + .fragment_front_face = face_dirty, + .fragment_front_face_2 = fs_dirty || IS_DIRTY(IA_PRIMITIVE_TOPOLOGY), + .fragment_front_stencil = stencil_face_dirty, + .fragment_back_face = face_dirty, + .fragment_back_face_2 = fs_dirty || IS_DIRTY(IA_PRIMITIVE_TOPOLOGY), + .fragment_back_stencil = stencil_face_dirty, + .output_select = hw_vs_dirty || linked_fs_dirty || varyings_dirty, + .varying_counts_32 = varyings_dirty, + .varying_counts_16 = varyings_dirty, + .cull = + IS_DIRTY(RS_CULL_MODE) || IS_DIRTY(RS_RASTERIZER_DISCARD_ENABLE) || + IS_DIRTY(RS_FRONT_FACE) || IS_DIRTY(RS_DEPTH_CLIP_ENABLE) || + IS_DIRTY(RS_DEPTH_CLAMP_ENABLE) || IS_DIRTY(RS_LINE_MODE) || + IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) || (gfx->dirty & HK_DIRTY_PROVOKING), + .cull_2 = varyings_dirty, + + /* With a null FS, the fragment shader PPP word is ignored and doesn't + * need to be present. + */ + .fragment_shader = fs && (fs_dirty || linked_fs_dirty || varyings_dirty || + gfx->descriptors.root_dirty), + + .occlusion_query = gfx->dirty & HK_DIRTY_OCCLUSION, + .output_size = hw_vs_dirty, + .viewport_count = 1, /* irrelevant */ + }; + + /* Calculate the update size. If it equals the header, there is nothing to + * update so early-exit. + */ + size_t size = agx_ppp_update_size(&dirty); + if (size == AGX_PPP_HEADER_LENGTH) + return; + + /* Otherwise, allocate enough space for the update and push it. */ + assert(size > AGX_PPP_HEADER_LENGTH); + + struct agx_ptr T = hk_pool_alloc(cmd, size, 64); + if (!T.cpu) + return; + + struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &dirty); + + if (dirty.fragment_control) { + agx_ppp_push(&ppp, FRAGMENT_CONTROL, cfg) { + cfg.visibility_mode = gfx->occlusion.mode; + cfg.stencil_test_enable = hk_stencil_test_enabled(cmd); + + /* TODO: Consider optimizing this? 
*/ + cfg.two_sided_stencil = cfg.stencil_test_enable; + + cfg.depth_bias_enable = dyn->rs.depth_bias.enable && + gfx->object_type == AGX_OBJECT_TYPE_TRIANGLE; + + /* Always enable scissoring so we may scissor to the viewport (TODO: + * optimize this out if the viewport is the default and the app does + * not use the scissor test) + */ + cfg.scissor_enable = true; + + /* This avoids broken derivatives along primitive edges */ + cfg.disable_tri_merging = gfx->object_type != AGX_OBJECT_TYPE_TRIANGLE; + } + } + + if (dirty.fragment_control_2) { + if (linked_fs) { + /* Annoying, rasterizer_discard seems to be ignored (sometimes?) in the + * main fragment control word and has to be combined into the secondary + * word for reliable behaviour. + */ + agx_ppp_push_merged(&ppp, FRAGMENT_CONTROL, cfg, + linked_fs->b.fragment_control) { + + cfg.tag_write_disable = dyn->rs.rasterizer_discard_enable; + } + } else { + /* If there is no fragment shader, we must disable tag writes to avoid + * executing the missing shader. This optimizes depth-only passes. + */ + agx_ppp_push(&ppp, FRAGMENT_CONTROL, cfg) { + cfg.tag_write_disable = true; + cfg.pass_type = AGX_PASS_TYPE_OPAQUE; + } + } + } + + struct agx_fragment_face_packed fragment_face; + struct agx_fragment_face_2_packed fragment_face_2; + + if (dirty.fragment_front_face) { + bool has_z = render->depth_att.vk_format != VK_FORMAT_UNDEFINED; + bool z_test = has_z && dyn->ds.depth.test_enable; + + agx_pack(&fragment_face, FRAGMENT_FACE, cfg) { + cfg.line_width = agx_pack_line_width(dyn->rs.line.width); + cfg.polygon_mode = translate_polygon_mode(dyn->rs.polygon_mode); + cfg.disable_depth_write = !(z_test && dyn->ds.depth.write_enable); + + if (z_test && !gfx->descriptors.root.draw.force_never_in_shader) + cfg.depth_function = translate_compare_op(dyn->ds.depth.compare_op); + else + cfg.depth_function = AGX_ZS_FUNC_ALWAYS; + }; + + agx_ppp_push_merged(&ppp, FRAGMENT_FACE, cfg, fragment_face) { + cfg.stencil_reference = dyn->ds.stencil.front.reference; + } + } + + if (dirty.fragment_front_face_2) { + agx_pack(&fragment_face_2, FRAGMENT_FACE_2, cfg) { + cfg.object_type = gfx->object_type; + + /* TODO: flip the default? 
*/ + if (fs) + cfg.conservative_depth = 0; + } + + if (fs) + agx_merge(fragment_face_2, fs->frag_face, FRAGMENT_FACE_2); + + agx_ppp_push_packed(&ppp, &fragment_face_2, FRAGMENT_FACE_2); + } + + if (dirty.fragment_front_stencil) { + hk_ppp_push_stencil_face(&ppp, dyn->ds.stencil.front, + hk_stencil_test_enabled(cmd)); + } + + if (dirty.fragment_back_face) { + assert(dirty.fragment_front_face); + + agx_ppp_push_merged(&ppp, FRAGMENT_FACE, cfg, fragment_face) { + cfg.stencil_reference = dyn->ds.stencil.back.reference; + } + } + + if (dirty.fragment_back_face_2) { + assert(dirty.fragment_front_face_2); + + agx_ppp_push_packed(&ppp, &fragment_face_2, FRAGMENT_FACE_2); + } + + if (dirty.fragment_back_stencil) { + hk_ppp_push_stencil_face(&ppp, dyn->ds.stencil.back, + hk_stencil_test_enabled(cmd)); + } + + if (dirty.output_select) { + struct agx_output_select_packed osel = hw_vs->info.uvs.osel; + + if (linked_fs) { + agx_ppp_push_merged_blobs(&ppp, AGX_OUTPUT_SELECT_LENGTH, &osel, + &linked_fs->b.osel); + } else { + agx_ppp_push_packed(&ppp, &osel, OUTPUT_SELECT); + } + } + + assert(dirty.varying_counts_32 == dirty.varying_counts_16); + + if (dirty.varying_counts_32) { + agx_ppp_push_packed(&ppp, &gfx->linked_varyings.counts_32, + VARYING_COUNTS); + + agx_ppp_push_packed(&ppp, &gfx->linked_varyings.counts_16, + VARYING_COUNTS); + } + + if (dirty.cull) { + agx_ppp_push(&ppp, CULL, cfg) { + cfg.cull_front = dyn->rs.cull_mode & VK_CULL_MODE_FRONT_BIT; + cfg.cull_back = dyn->rs.cull_mode & VK_CULL_MODE_BACK_BIT; + cfg.front_face_ccw = dyn->rs.front_face != VK_FRONT_FACE_CLOCKWISE; + cfg.flat_shading_vertex = translate_ppp_vertex(gfx->provoking); + cfg.rasterizer_discard = dyn->rs.rasterizer_discard_enable; + + /* We do not support unrestricted depth, so clamping is inverted from + * clipping. This implementation seems to pass CTS without unrestricted + * depth support. + * + * TODO: Make sure this is right with gl_FragDepth. + */ + cfg.depth_clip = vk_rasterization_state_depth_clip_enable(&dyn->rs); + cfg.depth_clamp = !cfg.depth_clip; + + cfg.primitive_msaa = + gfx->object_type == AGX_OBJECT_TYPE_LINE && + dyn->rs.line.mode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR; + } + } + + if (dirty.cull_2) { + agx_ppp_push(&ppp, CULL_2, cfg) { + cfg.needs_primitive_id = gfx->generate_primitive_id; + } + } + + if (dirty.fragment_shader) { + /* TODO: Do less often? 
*/ + hk_reserve_scratch(cmd, cs, fs); + + agx_ppp_push_packed(&ppp, &linked_fs->fs_counts, FRAGMENT_SHADER_WORD_0); + + agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_1, cfg) { + cfg.pipeline = hk_upload_usc_words(cmd, fs, linked_fs); + } + + agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_2, cfg) { + cfg.cf_bindings = gfx->varyings; + } + + agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_3, cfg) + ; + } + + if (dirty.occlusion_query) { + agx_ppp_push(&ppp, FRAGMENT_OCCLUSION_QUERY, cfg) { + cfg.index = gfx->occlusion.index; + } + } + + if (dirty.output_size) { + agx_ppp_push(&ppp, OUTPUT_SIZE, cfg) { + cfg.count = hw_vs->info.uvs.size; + } + } + + agx_ppp_fini(out, &ppp); +} + +static void +hk_flush_dynamic_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, + uint32_t draw_id, struct hk_draw draw) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + const struct hk_rendering_state *render = &cmd->state.gfx.render; + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + + struct hk_graphics_state *gfx = &cmd->state.gfx; + + struct hk_shader *hw_vs = hk_bound_hw_vs(gfx); + struct hk_shader *sw_vs = hk_bound_sw_vs(gfx); + + if (!vk_dynamic_graphics_state_any_dirty(dyn) && + !(gfx->dirty & ~HK_DIRTY_INDEX) && !gfx->descriptors.root_dirty && + !gfx->shaders_dirty && !sw_vs->b.info.uses_draw_id && + !sw_vs->b.info.uses_base_param && + !(gfx->linked[MESA_SHADER_VERTEX] && + gfx->linked[MESA_SHADER_VERTEX]->b.uses_base_param)) + return; + + struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors; + + assert(cs->current + 0x1000 < cs->end && "already ensured space"); + uint8_t *out = cs->current; + + struct hk_shader *fs = hk_only_variant(gfx->shaders[MESA_SHADER_FRAGMENT]); + + bool gt_dirty = IS_SHADER_DIRTY(TESS_CTRL) || IS_SHADER_DIRTY(TESS_EVAL) || + IS_SHADER_DIRTY(GEOMETRY); + bool vgt_dirty = IS_SHADER_DIRTY(VERTEX) || gt_dirty; + bool fs_dirty = IS_SHADER_DIRTY(FRAGMENT); + + if (IS_DIRTY(CB_BLEND_CONSTANTS)) { + static_assert(sizeof(desc->root.draw.blend_constant) == + sizeof(dyn->cb.blend_constants) && + "common size"); + + memcpy(desc->root.draw.blend_constant, dyn->cb.blend_constants, + sizeof(dyn->cb.blend_constants)); + desc->root_dirty = true; + } + + if (IS_DIRTY(MS_SAMPLE_MASK)) { + desc->root.draw.api_sample_mask = dyn->ms.sample_mask; + desc->root_dirty = true; + } + + if (fs_dirty || IS_DIRTY(DS_DEPTH_TEST_ENABLE) || + IS_DIRTY(DS_DEPTH_COMPARE_OP)) { + + const struct hk_rendering_state *render = &cmd->state.gfx.render; + bool has_z = render->depth_att.vk_format != VK_FORMAT_UNDEFINED; + bool z_test = has_z && dyn->ds.depth.test_enable; + + desc->root.draw.force_never_in_shader = + z_test && dyn->ds.depth.compare_op == VK_COMPARE_OP_NEVER && fs && + fs->info.fs.writes_memory; + + desc->root_dirty = true; + } + + /* The main shader must not run tests if the epilog will. */ + bool nontrivial_force_early = + fs && (fs->b.info.early_fragment_tests && + (fs->b.info.writes_sample_mask || fs->info.fs.writes_memory)); + + bool epilog_discards = dyn->ms.alpha_to_coverage_enable || + (fs && (fs->info.fs.epilog_key.write_z || + fs->info.fs.epilog_key.write_s)); + epilog_discards &= !nontrivial_force_early; + + if (fs_dirty || IS_DIRTY(MS_ALPHA_TO_COVERAGE_ENABLE)) { + desc->root.draw.no_epilog_discard = !epilog_discards ? 
~0 : 0; + desc->root_dirty = true; + } + + if (IS_DIRTY(VI) || IS_DIRTY(VI_BINDINGS_VALID) || + IS_DIRTY(VI_BINDING_STRIDES) || vgt_dirty || true /* TODO */) { + + struct hk_fast_link_key_vs key = { + .prolog.hw = (sw_vs == hw_vs), + + /* FIXME: handle pipeline robustness "properly" */ + .prolog.robustness.level = + (dev->vk.enabled_features.robustBufferAccess2 || + dev->vk.enabled_features.pipelineRobustness) + ? AGX_ROBUSTNESS_D3D + : AGX_ROBUSTNESS_GL, + + .prolog.robustness.soft_fault = false /*TODO*/, + }; + + if (!key.prolog.hw) { + key.prolog.sw_index_size_B = + draw.indexed ? agx_index_size_to_B(draw.index_size) : 0; + } + + static_assert(sizeof(key.prolog.component_mask) == + sizeof(sw_vs->info.vs.attrib_components_read)); + BITSET_COPY(key.prolog.component_mask, + sw_vs->info.vs.attrib_components_read); + + u_foreach_bit(a, dyn->vi->attributes_valid) { + struct vk_vertex_attribute_state attr = dyn->vi->attributes[a]; + + assert(dyn->vi->bindings_valid & BITFIELD_BIT(attr.binding)); + struct vk_vertex_binding_state binding = + dyn->vi->bindings[attr.binding]; + + /* nir_assign_io_var_locations compacts vertex inputs, eliminating + * unused inputs. We need to do the same here to match the locations. + */ + unsigned slot = + util_bitcount64(sw_vs->info.vs.attribs_read & BITFIELD_MASK(a)); + + key.prolog.attribs[slot] = (struct agx_velem_key){ + .format = vk_format_to_pipe_format(attr.format), + .stride = dyn->vi_binding_strides[attr.binding], + .divisor = binding.divisor, + .instanced = binding.input_rate == VK_VERTEX_INPUT_RATE_INSTANCE, + }; + } + + hk_update_fast_linked(cmd, sw_vs, &key); + } + + if (IS_DIRTY(VI) || IS_DIRTY(VI_BINDINGS_VALID) || vgt_dirty || + (gfx->dirty & HK_DIRTY_VB)) { + + uint64_t sink = dev->rodata.zero_sink; + + unsigned slot = 0; + u_foreach_bit(a, sw_vs->info.vs.attribs_read) { + if (dyn->vi->attributes_valid & BITFIELD_BIT(a)) { + struct vk_vertex_attribute_state attr = dyn->vi->attributes[a]; + struct hk_addr_range vb = gfx->vb[attr.binding]; + + desc->root.draw.attrib_clamps[slot] = agx_calculate_vbo_clamp( + vb.addr, sink, vk_format_to_pipe_format(attr.format), vb.range, + dyn->vi_binding_strides[attr.binding], attr.offset, + &desc->root.draw.attrib_base[slot]); + } else { + desc->root.draw.attrib_base[slot] = sink; + desc->root.draw.attrib_clamps[slot] = 0; + } + + ++slot; + } + + desc->root_dirty = true; + } + + if (vgt_dirty || IS_SHADER_DIRTY(FRAGMENT) || + IS_DIRTY(MS_RASTERIZATION_SAMPLES) || IS_DIRTY(MS_SAMPLE_MASK) || + IS_DIRTY(MS_ALPHA_TO_COVERAGE_ENABLE) || + IS_DIRTY(MS_ALPHA_TO_ONE_ENABLE) || IS_DIRTY(CB_LOGIC_OP) || + IS_DIRTY(CB_LOGIC_OP_ENABLE) || IS_DIRTY(CB_WRITE_MASKS) || + IS_DIRTY(CB_COLOR_WRITE_ENABLES) || IS_DIRTY(CB_ATTACHMENT_COUNT) || + IS_DIRTY(CB_BLEND_ENABLES) || IS_DIRTY(CB_BLEND_EQUATIONS) || + IS_DIRTY(CB_BLEND_CONSTANTS) || + desc->root_dirty /* for pipeline stats */ || true) { + + if (fs) { + unsigned samples_shaded = 0; + if (fs->info.fs.epilog_key.sample_shading) + samples_shaded = dyn->ms.rasterization_samples; + + unsigned tib_sample_mask = + BITFIELD_MASK(dyn->ms.rasterization_samples); + unsigned api_sample_mask = dyn->ms.sample_mask & tib_sample_mask; + bool has_sample_mask = api_sample_mask != tib_sample_mask; + + struct hk_fast_link_key_fs key = { + .prolog.statistics = hk_pipeline_stat_addr( + cmd, + VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT), + + .prolog.cull_distance_size = + hw_vs->info.vs.cull_distance_array_size, + .prolog.api_sample_mask = has_sample_mask ? 
api_sample_mask : 0xff, + .nr_samples_shaded = samples_shaded, + }; + + bool prolog_discards = + has_sample_mask || key.prolog.cull_distance_size; + + bool needs_prolog = key.prolog.statistics || prolog_discards; + + if (needs_prolog) { + /* With late main shader tests, the prolog runs tests if neither the + * main shader nor epilog will. + * + * With (nontrivial) early main shader tests, the prolog does not + * run tests, the tests will run at the start of the main shader. + * This ensures tests are after API sample mask and cull distance + * discards. + */ + key.prolog.run_zs_tests = !nontrivial_force_early && + !fs->b.info.writes_sample_mask && + !epilog_discards && prolog_discards; + + if (key.prolog.cull_distance_size) { + key.prolog.cf_base = fs->b.info.varyings.fs.nr_cf; + } + } + + key.epilog = (struct agx_fs_epilog_key){ + .link = fs->info.fs.epilog_key, + .nr_samples = MAX2(dyn->ms.rasterization_samples, 1), + .blend.alpha_to_coverage = dyn->ms.alpha_to_coverage_enable, + .blend.alpha_to_one = dyn->ms.alpha_to_one_enable, + .blend.logicop_func = dyn->cb.logic_op_enable + ? vk_logic_op_to_pipe(dyn->cb.logic_op) + : PIPE_LOGICOP_COPY, + }; + + key.epilog.link.already_ran_zs |= nontrivial_force_early; + + struct hk_rendering_state *render = &cmd->state.gfx.render; + for (uint32_t i = 0; i < render->color_att_count; i++) { + key.epilog.rt_formats[i] = + vk_format_to_pipe_format(render->color_att[i].vk_format); + + const struct vk_color_blend_attachment_state *cb = + &dyn->cb.attachments[i]; + + bool write_enable = dyn->cb.color_write_enables & BITFIELD_BIT(i); + unsigned write_mask = write_enable ? cb->write_mask : 0; + + /* nir_lower_blend always blends, so use a default blend state when + * blending is disabled at an API level. + */ + if (!dyn->cb.attachments[i].blend_enable) { + key.epilog.blend.rt[i] = (struct agx_blend_rt_key){ + .colormask = write_mask, + .rgb_func = PIPE_BLEND_ADD, + .alpha_func = PIPE_BLEND_ADD, + .rgb_src_factor = PIPE_BLENDFACTOR_ONE, + .alpha_src_factor = PIPE_BLENDFACTOR_ONE, + .rgb_dst_factor = PIPE_BLENDFACTOR_ZERO, + .alpha_dst_factor = PIPE_BLENDFACTOR_ZERO, + }; + } else { + key.epilog.blend.rt[i] = (struct agx_blend_rt_key){ + .colormask = write_mask, + + .rgb_src_factor = + vk_blend_factor_to_pipe(cb->src_color_blend_factor), + + .rgb_dst_factor = + vk_blend_factor_to_pipe(cb->dst_color_blend_factor), + + .rgb_func = vk_blend_op_to_pipe(cb->color_blend_op), + + .alpha_src_factor = + vk_blend_factor_to_pipe(cb->src_alpha_blend_factor), + + .alpha_dst_factor = + vk_blend_factor_to_pipe(cb->dst_alpha_blend_factor), + + .alpha_func = vk_blend_op_to_pipe(cb->alpha_blend_op), + }; + } + } + + hk_update_fast_linked(cmd, fs, &key); + } else { + /* TODO: prolog without fs needs to work too... */ + if (cmd->state.gfx.linked[MESA_SHADER_FRAGMENT] != NULL) { + cmd->state.gfx.linked_dirty |= BITFIELD_BIT(MESA_SHADER_FRAGMENT); + cmd->state.gfx.linked[MESA_SHADER_FRAGMENT] = NULL; + } + } + } + + /* If the vertex shader uses draw parameters, vertex uniforms are dirty every + * draw. Fragment uniforms are unaffected. + * + * For a direct draw, we upload the draw parameters as-if indirect to + * avoid keying to indirectness. 
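+ * + * Either way, gfx->draw_params ends up pointing at two consecutive 32-bit + * words, {firstVertex or vertexOffset, firstInstance}, matching the tail of + * the corresponding indirect command layout.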
+ */ + if (gfx->linked[MESA_SHADER_VERTEX]->b.uses_base_param) { + if (draw.b.indirect) { + gfx->draw_params = draw.b.ptr; + + if (draw.indexed) { + gfx->draw_params += + offsetof(VkDrawIndexedIndirectCommand, vertexOffset); + } else { + gfx->draw_params += offsetof(VkDrawIndirectCommand, firstVertex); + } + } else { + uint32_t params[] = { + draw.indexed ? draw.index_bias : draw.start, + draw.start_instance, + }; + + gfx->draw_params = hk_pool_upload(cmd, params, sizeof(params), 4); + } + } else { + gfx->draw_params = 0; + } + + if (sw_vs->b.info.uses_draw_id) { + /* TODO: rodata? */ + gfx->draw_id_ptr = hk_pool_upload(cmd, &draw_id, 2, 4); + } else { + gfx->draw_id_ptr = 0; + } + + if (IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) || gt_dirty) { + enum mesa_prim prim = hk_rast_prim(cmd); + + gfx->topology = translate_hw_primitive_topology(prim); + gfx->object_type = translate_object_type(prim); + } + + if (IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) || IS_DIRTY(RS_PROVOKING_VERTEX)) { + unsigned provoking; + if (dyn->rs.provoking_vertex == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT) + provoking = 2; + else if (gfx->topology == AGX_PRIMITIVE_TRIANGLE_FAN) + provoking = 1; + else + provoking = 0; + + if (provoking != gfx->provoking) { + gfx->provoking = provoking; + gfx->dirty |= HK_DIRTY_PROVOKING; + + gfx->descriptors.root.draw.provoking = provoking; + gfx->descriptors.root_dirty = true; + } + } + + /* With attachmentless rendering, we don't know the sample count until draw + * time, so we do a late tilebuffer fix up. But with rasterizer discard, + * rasterization_samples might be 0. + */ + if (dyn->ms.rasterization_samples && + gfx->render.tilebuffer.nr_samples != dyn->ms.rasterization_samples) { + + assert(gfx->render.tilebuffer.nr_samples == 0); + + unsigned nr_samples = MAX2(dyn->ms.rasterization_samples, 1); + gfx->render.tilebuffer.nr_samples = nr_samples; + agx_tilebuffer_pack_usc(&gfx->render.tilebuffer); + cs->tib = gfx->render.tilebuffer; + } + + if (IS_DIRTY(MS_SAMPLE_LOCATIONS) || IS_DIRTY(MS_SAMPLE_LOCATIONS_ENABLE) || + IS_DIRTY(MS_RASTERIZATION_SAMPLES)) { + + uint32_t ctrl; + if (dyn->ms.sample_locations_enable) { + ctrl = hk_pack_ppp_multisamplectrl(dyn->ms.sample_locations); + } else { + ctrl = hk_default_sample_positions(dyn->ms.rasterization_samples); + } + + bool dont_commit = cmd->in_meta || dyn->ms.rasterization_samples == 0; + + if (!cs->has_sample_locations) { + cs->ppp_multisamplectl = ctrl; + + /* If we're in vk_meta, do not commit to the sample locations yet. + * vk_meta doesn't care, but the app will! + */ + cs->has_sample_locations |= !dont_commit; + } else { + assert(dont_commit || cs->ppp_multisamplectl == ctrl); + } + + gfx->descriptors.root.draw.ppp_multisamplectl = ctrl; + gfx->descriptors.root_dirty = true; + } + + /* Link varyings before uploading tessellation state, because the + * gfx->generate_primitive_id boolean needs to be plumbed. + */ + struct hk_linked_shader *linked_vs = gfx->linked[MESA_SHADER_VERTEX]; + struct hk_linked_shader *linked_fs = gfx->linked[MESA_SHADER_FRAGMENT]; + bool linked_vs_dirty = IS_LINKED_DIRTY(VERTEX); + bool linked_fs_dirty = IS_LINKED_DIRTY(FRAGMENT); + + if ((gfx->dirty & HK_DIRTY_PROVOKING) || vgt_dirty || linked_fs_dirty) { + unsigned bindings = linked_fs ? 
linked_fs->b.cf.nr_bindings : 0; + if (bindings) { + size_t linkage_size = + AGX_CF_BINDING_HEADER_LENGTH + (bindings * AGX_CF_BINDING_LENGTH); + + struct agx_ptr t = hk_pool_usc_alloc(cmd, linkage_size, 16); + if (!t.cpu) + return; + + agx_link_varyings_vs_fs( + t.cpu, &gfx->linked_varyings, hw_vs->info.uvs.user_size, + &linked_fs->b.cf, gfx->provoking, 0, &gfx->generate_primitive_id); + + gfx->varyings = t.gpu; + } else { + gfx->varyings = 0; + } + + gfx->dirty |= HK_DIRTY_VARYINGS; + } + + if (gfx->shaders[MESA_SHADER_TESS_EVAL] || + gfx->shaders[MESA_SHADER_GEOMETRY]) { + + struct hk_shader *vs = hk_bound_sw_vs(gfx); + desc->root.draw.vertex_outputs = vs->b.info.outputs; + + /* XXX: We should deduplicate this logic */ + bool restart = (draw.indexed && draw.restart); + bool indirect = draw.b.indirect || restart; + + desc->root.draw.input_assembly = + indirect ? hk_pool_alloc(cmd, sizeof(struct agx_ia_state), 4).gpu + : hk_upload_ia_params(cmd, draw); + + if (!indirect) { + uint32_t verts = draw.b.count[0], instances = draw.b.count[1]; + unsigned vb_size = + libagx_tcs_in_size(verts * instances, vs->b.info.outputs); + + /* Allocate if there are any outputs, or use the null sink to trap + * reads if there aren't. Those reads are undefined but should not + * fault. Affects: + * + * dEQP-VK.pipeline.monolithic.no_position.explicit_declarations.basic.single_view.v0_g1 + */ + desc->root.draw.vertex_output_buffer = + vb_size ? hk_pool_alloc(cmd, vb_size, 4).gpu + : dev->rodata.null_sink; + } + } + + if (gfx->shaders[MESA_SHADER_TESS_EVAL]) { + gfx->descriptors.root.draw.tess_params = hk_upload_tess_params(cmd, draw); + gfx->descriptors.root_dirty = true; + } + + if (gfx->shaders[MESA_SHADER_GEOMETRY]) { + /* TODO: size */ + cmd->geom_indirect = hk_pool_alloc(cmd, 64, 4).gpu; + + gfx->descriptors.root.draw.geometry_params = + hk_upload_geometry_params(cmd, draw); + + gfx->descriptors.root_dirty = true; + } + + /* Root must be uploaded after the above, which touch the root */ + if (gfx->descriptors.root_dirty) { + gfx->root = + hk_cmd_buffer_upload_root(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS); + } + + /* Hardware dynamic state must be deferred until after the root and fast + * linking, since it will use the root address and the linked shaders. + */ + if ((gfx->dirty & (HK_DIRTY_PROVOKING | HK_DIRTY_VARYINGS)) || + IS_DIRTY(RS_RASTERIZER_DISCARD_ENABLE) || linked_vs_dirty || vgt_dirty || + gfx->descriptors.root_dirty || gfx->draw_id_ptr || gfx->draw_params) { + + /* TODO: Do less often? */ + hk_reserve_scratch(cmd, cs, hw_vs); + + agx_push(out, VDM_STATE, cfg) { + cfg.vertex_shader_word_0_present = true; + cfg.vertex_shader_word_1_present = true; + cfg.vertex_outputs_present = true; + cfg.vertex_unknown_present = true; + } + + agx_push_packed(out, hw_vs->counts, VDM_STATE_VERTEX_SHADER_WORD_0); + + struct hk_linked_shader *linked_hw_vs = + (hw_vs == sw_vs) ? 
linked_vs : hw_vs->only_linked; + + agx_push(out, VDM_STATE_VERTEX_SHADER_WORD_1, cfg) { + cfg.pipeline = hk_upload_usc_words(cmd, hw_vs, linked_hw_vs); + } + + agx_push_packed(out, hw_vs->info.uvs.vdm, VDM_STATE_VERTEX_OUTPUTS); + + agx_push(out, VDM_STATE_VERTEX_UNKNOWN, cfg) { + cfg.flat_shading_control = translate_vdm_vertex(gfx->provoking); + cfg.unknown_4 = cfg.unknown_5 = dyn->rs.rasterizer_discard_enable; + cfg.generate_primitive_id = gfx->generate_primitive_id; + } + + /* Pad up to a multiple of 8 bytes */ + memset(out, 0, 4); + out += 4; + } + + if (IS_DIRTY(RS_DEPTH_BIAS_FACTORS)) { + void *ptr = + util_dynarray_grow_bytes(&cs->depth_bias, 1, AGX_DEPTH_BIAS_LENGTH); + + agx_pack(ptr, DEPTH_BIAS, cfg) { + cfg.depth_bias = dyn->rs.depth_bias.constant; + cfg.slope_scale = dyn->rs.depth_bias.slope; + cfg.clamp = dyn->rs.depth_bias.clamp; + + /* Value from the PowerVR driver. */ + if (render->depth_att.vk_format == VK_FORMAT_D16_UNORM) { + cfg.depth_bias /= (1 << 15); + } + } + } + + /* Hardware viewport/scissor state is entangled with depth bias. */ + if (IS_DIRTY(RS_DEPTH_BIAS_FACTORS) || IS_DIRTY(VP_SCISSORS) || + IS_DIRTY(VP_SCISSOR_COUNT) || IS_DIRTY(VP_VIEWPORTS) || + IS_DIRTY(VP_VIEWPORT_COUNT) || + IS_DIRTY(VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE) || + IS_DIRTY(RS_DEPTH_CLIP_ENABLE) || IS_DIRTY(RS_DEPTH_CLAMP_ENABLE)) { + + hk_flush_vp_state(cmd, cs, &out); + } + + hk_flush_ppp_state(cmd, cs, &out); + cs->current = out; + + vk_dynamic_graphics_state_clear_dirty(dyn); + gfx->shaders_dirty = 0; + gfx->linked_dirty = 0; + gfx->dirty = 0; + gfx->descriptors.root_dirty = false; +} + +static bool +hk_needs_index_robustness(struct hk_cmd_buffer *cmd, struct hk_draw draw) +{ + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + if (!draw.indexed) + return false; + + /* If tessellation is used, we'll go through the robust path anyway, don't + * end up with a tess+geom doom combo. + */ + if (gfx->shaders[MESA_SHADER_TESS_EVAL]) + return false; + + /* Allowed with maint6 without robustness features enabled */ + if (draw.index.range == 0) + return true; + + if (!(dev->vk.enabled_features.robustBufferAccess || + dev->vk.enabled_features.robustBufferAccess2 || + dev->vk.enabled_features.pipelineRobustness)) + return false; + + if (draw.b.indirect) { + return true; + } else { + uint32_t range_B = + (draw.start + draw.b.count[0]) * agx_index_size_to_B(draw.index_size); + + return range_B > draw.index.range; + } +} + +static void +hk_handle_passthrough_gs(struct hk_cmd_buffer *cmd, struct hk_draw draw) +{ + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY]; + + /* If there's an application geometry shader, there's nothing to un/bind */ + if (gs && !gs->is_passthrough) + return; + + /* Determine if we need a geometry shader to emulate XFB or adjacency */ + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + struct hk_shader *last_sw = hk_bound_sw_vs_before_gs(gfx); + uint32_t xfb_outputs = last_sw->info.xfb_info.output_count; + + VkPrimitiveTopology topology = dyn->ia.primitive_topology; + bool adjacency = + (topology == VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY) || + (topology == VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY) || + (topology == VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY) || + (topology == VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY); + + /* TODO: Don't use a whole GS just for index robustness. 
*/ + bool index_robustness = hk_needs_index_robustness(cmd, draw); + + bool needs_gs = xfb_outputs || adjacency || index_robustness; + + /* Various pipeline statistics are implemented in the pre-GS shader. TODO: + * This could easily be optimized. + */ + VkQueryPipelineStatisticFlagBits ia_statistics[] = { + VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT, + VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT, + VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT, + }; + + bool ia_stats = false; + + for (unsigned i = 0; i < ARRAY_SIZE(ia_statistics); ++i) { + ia_stats |= hk_pipeline_stat_addr(cmd, ia_statistics[i]) != 0; + } + + needs_gs |= ia_stats; + + /* If we already have a matching GS configuration, we're done */ + if ((gs != NULL) == needs_gs) + return; + + /* If we don't need a GS but we do have a passthrough, unbind it */ + if (gs) { + assert(!needs_gs && gs->is_passthrough); + hk_cmd_bind_graphics_shader(cmd, MESA_SHADER_GEOMETRY, NULL); + return; + } + + /* Else, we need to bind a passthrough GS */ + size_t key_size = + sizeof(struct hk_passthrough_gs_key) + nir_xfb_info_size(xfb_outputs); + struct hk_passthrough_gs_key *key = alloca(key_size); + + *key = (struct hk_passthrough_gs_key){ + .prim = u_decomposed_prim(hk_gs_in_prim(cmd)), + .outputs = last_sw->b.info.outputs, + .clip_distance_array_size = last_sw->info.clip_distance_array_size, + .cull_distance_array_size = last_sw->info.cull_distance_array_size, + }; + + if (xfb_outputs) { + typed_memcpy(key->xfb_stride, last_sw->info.xfb_stride, + ARRAY_SIZE(key->xfb_stride)); + + memcpy(&key->xfb_info, &last_sw->info.xfb_info, + nir_xfb_info_size(xfb_outputs)); + } + + struct hk_device *dev = hk_cmd_buffer_device(cmd); + perf_debug(dev, "Binding passthrough GS for%s%s%s%s\n", + xfb_outputs ? " XFB" : "", adjacency ? " adjacency" : "", + index_robustness ? " robustness" : "", + ia_stats ? 
" statistics" : ""); + + gs = hk_meta_shader(dev, hk_nir_passthrough_gs, key, key_size); + gs->is_passthrough = true; + hk_cmd_bind_graphics_shader(cmd, MESA_SHADER_GEOMETRY, gs); +} + +static struct hk_cs * +hk_flush_gfx_state(struct hk_cmd_buffer *cmd, uint32_t draw_id, + struct hk_draw draw) +{ + struct hk_cs *cs = hk_cmd_buffer_get_cs(cmd, false /* compute */); + if (!cs) + return NULL; + + hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */); + + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_descriptor_state *desc = &gfx->descriptors; + struct hk_device *dev = hk_cmd_buffer_device(cmd); + +#ifndef NDEBUG + if (unlikely(dev->dev.debug & AGX_DBG_DIRTY)) { + hk_cmd_buffer_dirty_all(cmd); + } +#endif + + /* TODO: Try to reduce draw overhead of this */ + hk_handle_passthrough_gs(cmd, draw); + + hk_flush_shaders(cmd); + + if (desc->push_dirty) + hk_cmd_buffer_flush_push_descriptors(cmd, desc); + + if ((gfx->dirty & HK_DIRTY_INDEX) && + (gfx->index.restart || gfx->shaders[MESA_SHADER_GEOMETRY])) + hk_flush_index(cmd, cs); + + hk_flush_dynamic_state(cmd, cs, draw_id, draw); + return cs; +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdBindIndexBuffer2KHR(VkCommandBuffer commandBuffer, VkBuffer _buffer, + VkDeviceSize offset, VkDeviceSize size, + VkIndexType indexType) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_buffer, buffer, _buffer); + + cmd->state.gfx.index = (struct hk_index_buffer_state){ + .buffer = hk_buffer_addr_range(buffer, offset, size), + .size = agx_translate_index_size(vk_index_type_to_bytes(indexType)), + .restart = vk_index_to_restart(indexType), + }; + + /* TODO: check if necessary, blob does this */ + cmd->state.gfx.index.buffer.range = + align(cmd->state.gfx.index.buffer.range, 4); + + cmd->state.gfx.dirty |= HK_DIRTY_INDEX; +} + +void +hk_cmd_bind_vertex_buffer(struct hk_cmd_buffer *cmd, uint32_t vb_idx, + struct hk_addr_range addr_range) +{ + cmd->state.gfx.vb[vb_idx] = addr_range; + cmd->state.gfx.dirty |= HK_DIRTY_VB; +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdBindVertexBuffers2(VkCommandBuffer commandBuffer, uint32_t firstBinding, + uint32_t bindingCount, const VkBuffer *pBuffers, + const VkDeviceSize *pOffsets, + const VkDeviceSize *pSizes, + const VkDeviceSize *pStrides) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + + if (pStrides) { + vk_cmd_set_vertex_binding_strides(&cmd->vk, firstBinding, bindingCount, + pStrides); + } + + for (uint32_t i = 0; i < bindingCount; i++) { + VK_FROM_HANDLE(hk_buffer, buffer, pBuffers[i]); + uint32_t idx = firstBinding + i; + + uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE; + const struct hk_addr_range addr_range = + hk_buffer_addr_range(buffer, pOffsets[i], size); + + hk_cmd_bind_vertex_buffer(cmd, idx, addr_range); + } +} + +static bool +hk_set_view_index(struct hk_cmd_buffer *cmd, uint32_t view_idx) +{ + if (cmd->state.gfx.render.view_mask) { + cmd->state.gfx.descriptors.root.draw.view_index = view_idx; + cmd->state.gfx.descriptors.root_dirty = true; + } + + return true; +} + +/* + * Iterator macro to duplicate a draw for each enabled view (when multiview is + * enabled, else always view 0). Along with hk_lower_multiview, this forms the + * world's worst multiview lowering. 
+ */ +#define hk_foreach_view(cmd) \ + u_foreach_bit(view_idx, cmd->state.gfx.render.view_mask ?: 1) \ + if (hk_set_view_index(cmd, view_idx)) + +static void +hk_ia_update(struct hk_cmd_buffer *cmd, struct hk_cs *cs, struct hk_draw draw, + uint64_t ia_vertices, uint64_t vs_invocations) +{ + /* XXX: stream link needed? */ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + perf_debug(dev, "Input assembly counters"); + + struct agx_increment_ia_counters_key key = { + .index_size_B = draw.restart ? agx_index_size_to_B(draw.index_size) : 0, + }; + + uint64_t draw_ptr; + if (draw.b.indirect) { + draw_ptr = draw.b.ptr; + } else { + uint32_t desc[] = {draw.b.count[0], draw.b.count[1], 0}; + draw_ptr = hk_pool_upload(cmd, &desc, sizeof(desc), 4); + } + + struct libagx_increment_ia_counters args = { + .ia_vertices = ia_vertices, + .vs_invocations = vs_invocations, + .restart_index = cmd->state.gfx.index.restart, + .draw = draw_ptr, + .index_buffer = draw.index.addr, + .index_buffer_range_el = + key.index_size_B ? (draw.index.range / key.index_size_B) : 0, + }; + + uint64_t wg_size = key.index_size_B ? 1024 : 1; + + struct hk_shader *s = + hk_meta_kernel(dev, agx_nir_increment_ia_counters, &key, sizeof(key)); + + uint64_t push = hk_pool_upload(cmd, &args, sizeof(args), 8); + uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &push, sizeof(push)); + + hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(wg_size, 1, 1), + hk_grid(wg_size, 1, 1)); +} + +static void +hk_draw(struct hk_cmd_buffer *cmd, uint16_t draw_id, struct hk_draw draw_) +{ + const struct vk_dynamic_graphics_state *dyn = + &cmd->vk.dynamic_graphics_state; + + /* Filter trivial draws so we don't need to worry about null index buffers */ + if (!draw_.b.indirect && (draw_.b.count[0] == 0 || draw_.b.count[1] == 0)) + return; + + draw_.restart = dyn->ia.primitive_restart_enable; + draw_.index_size = cmd->state.gfx.index.size; + + uint64_t stat_ia_verts = hk_pipeline_stat_addr( + cmd, VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT); + + uint64_t stat_vs_inv = hk_pipeline_stat_addr( + cmd, VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT); + + bool ia_stats = stat_ia_verts || stat_vs_inv; + + hk_foreach_view(cmd) { + struct hk_draw draw = draw_; + struct hk_cs *cs = hk_flush_gfx_state(cmd, draw_id, draw); + /* If we failed to allocate a control stream, we've already lost the + * device. Just drop the draw so we don't crash. + */ + if (!cs) + return; + + bool geom = cmd->state.gfx.shaders[MESA_SHADER_GEOMETRY]; + bool tess = cmd->state.gfx.shaders[MESA_SHADER_TESS_EVAL]; + struct hk_cs *ccs = NULL; + uint8_t *out = cs->current; + assert(cs->current + 0x1000 < cs->end); + + if (geom || tess || ia_stats) { + ccs = + hk_cmd_buffer_get_cs_general(cmd, &cmd->current_cs.pre_gfx, true); + if (!ccs) + return; + } + + if (ia_stats) { + hk_ia_update(cmd, ccs, draw, stat_ia_verts, stat_vs_inv); + } + + if (tess) { + draw = hk_launch_tess(cmd, ccs, draw); + + if (draw.raw) { + assert(!geom); + assert(draw.b.indirect); + + agx_push(out, VDM_STREAM_LINK, cfg) { + cfg.target_lo = draw.b.ptr & BITFIELD_MASK(32); + cfg.target_hi = draw.b.ptr >> 32; + cfg.with_return = true; + } + + cs->current = out; + continue; + } + } + + if (geom) { + draw = hk_launch_gs_prerast(cmd, ccs, draw); + + /* We must not draw if the app specified rasterizer discard. This is + * required for both performance (it is pointless to rasterize and + * there are no side effects), but also correctness (no indirect draw + * descriptor will be filled out). 
+ */ + if (dyn->rs.rasterizer_discard_enable) + continue; + } + + uint64_t ib = draw.index.addr; + if (draw.indexed && !draw.b.indirect) + ib += (draw.start << draw.index_size); + + agx_push(out, INDEX_LIST, cfg) { + cfg.primitive = cmd->state.gfx.topology; + + if (draw.b.indirect) { + cfg.indirect_buffer_present = true; + } else { + cfg.instance_count_present = true; + cfg.index_count_present = true; + cfg.start_present = true; + } + + if (draw.indexed) { + cfg.restart_enable = draw.restart; + cfg.index_buffer_hi = ib >> 32; + cfg.index_size = draw.index_size; + + cfg.index_buffer_present = true; + cfg.index_buffer_size_present = true; + } + } + + if (draw.indexed) { + agx_push(out, INDEX_LIST_BUFFER_LO, cfg) { + cfg.buffer_lo = ib; + } + } + + if (draw.b.indirect) { + agx_push(out, INDEX_LIST_INDIRECT_BUFFER, cfg) { + cfg.address_hi = draw.b.ptr >> 32; + cfg.address_lo = draw.b.ptr & BITFIELD_MASK(32); + } + } else { + agx_push(out, INDEX_LIST_COUNT, cfg) { + cfg.count = draw.b.count[0]; + } + + agx_push(out, INDEX_LIST_INSTANCES, cfg) { + cfg.count = draw.b.count[1]; + } + + agx_push(out, INDEX_LIST_START, cfg) { + cfg.start = draw.indexed ? draw.index_bias : draw.start; + } + } + + if (draw.indexed) { + agx_push(out, INDEX_LIST_BUFFER_SIZE, cfg) { + cfg.size = draw.index.range; + } + } + + cs->current = out; + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount, + uint32_t instanceCount, uint32_t firstVertex, uint32_t firstInstance) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + + struct hk_draw draw = { + .b = hk_grid(vertexCount, instanceCount, 1), + .start = firstVertex, + .start_instance = firstInstance, + }; + + hk_draw(cmd, 0, draw); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDrawMultiEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, + const VkMultiDrawInfoEXT *pVertexInfo, + uint32_t instanceCount, uint32_t firstInstance, + uint32_t stride) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + + for (unsigned i = 0; i < drawCount; ++i) { + struct hk_draw draw = { + .b = hk_grid(pVertexInfo->vertexCount, instanceCount, 1), + .start = pVertexInfo->firstVertex, + .start_instance = firstInstance, + }; + + hk_draw(cmd, i, draw); + pVertexInfo = ((void *)pVertexInfo) + stride; + } +} + +static void +hk_draw_indexed(VkCommandBuffer commandBuffer, uint16_t draw_id, + uint32_t indexCount, uint32_t instanceCount, + uint32_t firstIndex, int32_t vertexOffset, + uint32_t firstInstance) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + + struct hk_draw draw = { + .b = hk_grid(indexCount, instanceCount, 1), + .indexed = true, + .index = cmd->state.gfx.index.buffer, + .start = firstIndex, + .index_bias = vertexOffset, + .start_instance = firstInstance, + }; + + hk_draw(cmd, draw_id, draw); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount, + uint32_t instanceCount, uint32_t firstIndex, + int32_t vertexOffset, uint32_t firstInstance) +{ + hk_draw_indexed(commandBuffer, 0, indexCount, instanceCount, firstIndex, + vertexOffset, firstInstance); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, + const VkMultiDrawIndexedInfoEXT *pIndexInfo, + uint32_t instanceCount, uint32_t firstInstance, + uint32_t stride, const int32_t *pVertexOffset) +{ + for (unsigned i = 0; i < drawCount; ++i) { + const uint32_t vertex_offset = + pVertexOffset != NULL ? 
*pVertexOffset : pIndexInfo->vertexOffset; + + hk_draw_indexed(commandBuffer, i, pIndexInfo->indexCount, instanceCount, + pIndexInfo->firstIndex, vertex_offset, firstInstance); + + pIndexInfo = ((void *)pIndexInfo) + stride; + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, + VkDeviceSize offset, uint32_t drawCount, uint32_t stride) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_buffer, buffer, _buffer); + + /* From the Vulkan 1.3.238 spec: + * + * VUID-vkCmdDrawIndirect-drawCount-00476 + * + * "If drawCount is greater than 1, stride must be a multiple of 4 and + * must be greater than or equal to sizeof(VkDrawIndirectCommand)" + * + * and + * + * "If drawCount is less than or equal to one, stride is ignored." + */ + if (drawCount > 1) { + assert(stride % 4 == 0); + assert(stride >= sizeof(VkDrawIndirectCommand)); + } + + for (unsigned draw_id = 0; draw_id < drawCount; ++draw_id) { + uint64_t addr = hk_buffer_address(buffer, offset) + stride * draw_id; + hk_draw(cmd, draw_id, hk_draw_indirect(addr)); + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, + VkDeviceSize offset, uint32_t drawCount, + uint32_t stride) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_buffer, buffer, _buffer); + + /* From the Vulkan 1.3.238 spec: + * + * VUID-vkCmdDrawIndexedIndirect-drawCount-00528 + * + * "If drawCount is greater than 1, stride must be a multiple of 4 and + * must be greater than or equal to + * sizeof(VkDrawIndexedIndirectCommand)" + * + * and + * + * "If drawCount is less than or equal to one, stride is ignored." + */ + if (drawCount > 1) { + assert(stride % 4 == 0); + assert(stride >= sizeof(VkDrawIndexedIndirectCommand)); + } + + for (unsigned draw_id = 0; draw_id < drawCount; ++draw_id) { + uint64_t addr = hk_buffer_address(buffer, offset) + stride * draw_id; + + hk_draw( + cmd, draw_id, + hk_draw_indexed_indirect(addr, cmd->state.gfx.index.buffer, 0, 0)); + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDrawIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer, + VkDeviceSize offset, VkBuffer countBuffer, + VkDeviceSize countBufferOffset, uint32_t maxDrawCount, + uint32_t stride) +{ + unreachable("TODO"); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer, + VkDeviceSize offset, VkBuffer countBuffer, + VkDeviceSize countBufferOffset, + uint32_t maxDrawCount, uint32_t stride) +{ + unreachable("TODO"); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer, + uint32_t instanceCount, uint32_t firstInstance, + VkBuffer counterBuffer, + VkDeviceSize counterBufferOffset, + uint32_t counterOffset, uint32_t vertexStride) +{ + unreachable("TODO"); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer, + uint32_t firstBinding, + uint32_t bindingCount, + const VkBuffer *pBuffers, + const VkDeviceSize *pOffsets, + const VkDeviceSize *pSizes) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_graphics_state *gfx = &cmd->state.gfx; + + for (uint32_t i = 0; i < bindingCount; i++) { + VK_FROM_HANDLE(hk_buffer, buffer, pBuffers[i]); + uint32_t idx = firstBinding + i; + uint64_t size = pSizes ? 
pSizes[i] : VK_WHOLE_SIZE; + + gfx->xfb[idx] = hk_buffer_addr_range(buffer, pOffsets[i], size); + } +} + +static void +hk_libagx_copy_xfb_counters(nir_builder *b, const void *key) +{ + b->shader->info.workgroup_size_variable = true; + + libagx_copy_xfb_counters(b, nir_load_preamble(b, 1, 64)); +} + +static void +hk_begin_end_xfb(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer, + uint32_t counterBufferCount, const VkBuffer *pCounterBuffers, + const VkDeviceSize *pCounterBufferOffsets, bool begin) + +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct hk_graphics_state *gfx = &cmd->state.gfx; + + gfx->xfb_enabled = begin; + + /* If we haven't reserved XFB offsets yet for the command buffer, do so. */ + if (!gfx->xfb_offsets) { + gfx->xfb_offsets = hk_pool_alloc(cmd, 4 * sizeof(uint32_t), 4).gpu; + } + + struct hk_cs *cs = + hk_cmd_buffer_get_cs_general(cmd, &cmd->current_cs.pre_gfx, true); + if (!cs) + return; + hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */); + + struct libagx_xfb_counter_copy params = {}; + unsigned copies = 0; + + /* For CmdBeginTransformFeedbackEXT, we need to initialize everything */ + if (begin) { + for (copies = 0; copies < 4; ++copies) { + params.dest[copies] = gfx->xfb_offsets + copies * sizeof(uint32_t); + } + } + + for (unsigned i = 0; i < counterBufferCount; ++i) { + if (pCounterBuffers[i] == VK_NULL_HANDLE) + continue; + + VK_FROM_HANDLE(hk_buffer, buffer, pCounterBuffers[i]); + + uint64_t offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0; + uint64_t cb_addr = hk_buffer_address(buffer, offset); + uint32_t cmd_idx = firstCounterBuffer + i; + + if (begin) { + params.src[cmd_idx] = cb_addr; + } else { + params.dest[copies] = cb_addr; + params.src[copies] = gfx->xfb_offsets + cmd_idx * sizeof(uint32_t); + ++copies; + } + } + + if (begin) + copies = 4; + + if (copies > 0) { + perf_debug(dev, "XFB counter copy"); + + struct hk_shader *s = + hk_meta_kernel(dev, hk_libagx_copy_xfb_counters, NULL, 0); + + uint64_t push = hk_pool_upload(cmd, ¶ms, sizeof(params), 8); + uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &push, sizeof(push)); + + hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(copies, 1, 1), + hk_grid(copies, 1, 1)); + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer, + uint32_t firstCounterBuffer, + uint32_t counterBufferCount, + const VkBuffer *pCounterBuffers, + const VkDeviceSize *pCounterBufferOffsets) +{ + hk_begin_end_xfb(commandBuffer, firstCounterBuffer, counterBufferCount, + pCounterBuffers, pCounterBufferOffsets, true); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, + uint32_t firstCounterBuffer, + uint32_t counterBufferCount, + const VkBuffer *pCounterBuffers, + const VkDeviceSize *pCounterBufferOffsets) +{ + hk_begin_end_xfb(commandBuffer, firstCounterBuffer, counterBufferCount, + pCounterBuffers, pCounterBufferOffsets, false); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdBeginConditionalRenderingEXT( + VkCommandBuffer commandBuffer, + const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin) +{ + unreachable("stub"); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer) +{ + unreachable("stub"); +} diff --git a/src/asahi/vulkan/hk_cmd_meta.c b/src/asahi/vulkan/hk_cmd_meta.c new file mode 100644 index 00000000000..ee70d9d0d3c --- /dev/null +++ b/src/asahi/vulkan/hk_cmd_meta.c @@ -0,0 +1,1692 @@ +/* + * Copyright 
2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "vulkan/vulkan_core.h" +#include "agx_pack.h" +#include "hk_buffer.h" +#include "hk_cmd_buffer.h" +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_image.h" +#include "hk_physical_device.h" + +#include "nir_builder.h" +#include "shader_enums.h" +#include "vk_format.h" +#include "vk_meta.h" +#include "vk_pipeline.h" + +static VkResult +hk_cmd_bind_map_buffer(struct vk_command_buffer *vk_cmd, + struct vk_meta_device *meta, VkBuffer _buffer, + void **map_out) +{ + struct hk_cmd_buffer *cmd = container_of(vk_cmd, struct hk_cmd_buffer, vk); + VK_FROM_HANDLE(hk_buffer, buffer, _buffer); + + assert(buffer->vk.size < UINT_MAX); + struct agx_ptr T = hk_pool_alloc(cmd, buffer->vk.size, 16); + if (unlikely(T.cpu == NULL)) + return VK_ERROR_OUT_OF_POOL_MEMORY; + + buffer->addr = T.gpu; + *map_out = T.cpu; + return VK_SUCCESS; +} + +VkResult +hk_device_init_meta(struct hk_device *dev) +{ + VkResult result = vk_meta_device_init(&dev->vk, &dev->meta); + if (result != VK_SUCCESS) + return result; + + dev->meta.use_gs_for_layer = false; + dev->meta.use_stencil_export = true; + dev->meta.cmd_bind_map_buffer = hk_cmd_bind_map_buffer; + dev->meta.max_bind_map_buffer_size_B = 64 * 1024; + + return VK_SUCCESS; +} + +void +hk_device_finish_meta(struct hk_device *dev) +{ + vk_meta_device_finish(&dev->vk, &dev->meta); +} + +struct hk_meta_save { + struct vk_vertex_input_state _dynamic_vi; + struct vk_sample_locations_state _dynamic_sl; + struct vk_dynamic_graphics_state dynamic; + struct hk_api_shader *shaders[MESA_SHADER_MESH + 1]; + struct hk_addr_range vb0; + struct hk_descriptor_set *desc0; + bool has_push_desc0; + enum agx_visibility_mode occlusion; + struct hk_push_descriptor_set push_desc0; + VkQueryPipelineStatisticFlags pipeline_stats_flags; + uint8_t push[128]; +}; + +static void +hk_meta_begin(struct hk_cmd_buffer *cmd, struct hk_meta_save *save, + VkPipelineBindPoint bind_point) +{ + struct hk_descriptor_state *desc = hk_get_descriptors_state(cmd, bind_point); + + if (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) { + save->dynamic = cmd->vk.dynamic_graphics_state; + save->_dynamic_vi = cmd->state.gfx._dynamic_vi; + save->_dynamic_sl = cmd->state.gfx._dynamic_sl; + + static_assert(sizeof(cmd->state.gfx.shaders) == sizeof(save->shaders)); + memcpy(save->shaders, cmd->state.gfx.shaders, sizeof(save->shaders)); + + /* Pause queries */ + save->occlusion = cmd->state.gfx.occlusion.mode; + cmd->state.gfx.occlusion.mode = AGX_VISIBILITY_MODE_NONE; + cmd->state.gfx.dirty |= HK_DIRTY_OCCLUSION; + + save->pipeline_stats_flags = desc->root.draw.pipeline_stats_flags; + desc->root.draw.pipeline_stats_flags = 0; + desc->root_dirty = true; + } else { + save->shaders[MESA_SHADER_COMPUTE] = cmd->state.cs.shader; + } + + save->vb0 = cmd->state.gfx.vb[0]; + + save->desc0 = desc->sets[0]; + save->has_push_desc0 = desc->push[0]; + if (save->has_push_desc0) + save->push_desc0 = *desc->push[0]; + + static_assert(sizeof(save->push) == sizeof(desc->root.push)); + memcpy(save->push, desc->root.push, sizeof(save->push)); + + cmd->in_meta = true; +} + +static void +hk_meta_init_render(struct hk_cmd_buffer *cmd, + struct vk_meta_rendering_info *info) +{ + const struct hk_rendering_state *render = &cmd->state.gfx.render; + + *info = (struct vk_meta_rendering_info){ + .samples = MAX2(render->tilebuffer.nr_samples, 1), + .view_mask = render->view_mask, + 
.color_attachment_count = render->color_att_count, + .depth_attachment_format = render->depth_att.vk_format, + .stencil_attachment_format = render->stencil_att.vk_format, + }; + for (uint32_t a = 0; a < render->color_att_count; a++) + info->color_attachment_formats[a] = render->color_att[a].vk_format; +} + +static void +hk_meta_end(struct hk_cmd_buffer *cmd, struct hk_meta_save *save, + VkPipelineBindPoint bind_point) +{ + struct hk_descriptor_state *desc = hk_get_descriptors_state(cmd, bind_point); + desc->root_dirty = true; + + if (save->desc0) { + desc->sets[0] = save->desc0; + desc->root.sets[0] = hk_descriptor_set_addr(save->desc0); + desc->sets_dirty |= BITFIELD_BIT(0); + desc->push_dirty &= ~BITFIELD_BIT(0); + } else if (save->has_push_desc0) { + *desc->push[0] = save->push_desc0; + desc->push_dirty |= BITFIELD_BIT(0); + } + + if (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) { + /* Restore the dynamic state */ + assert(save->dynamic.vi == &cmd->state.gfx._dynamic_vi); + assert(save->dynamic.ms.sample_locations == &cmd->state.gfx._dynamic_sl); + cmd->vk.dynamic_graphics_state = save->dynamic; + cmd->state.gfx._dynamic_vi = save->_dynamic_vi; + cmd->state.gfx._dynamic_sl = save->_dynamic_sl; + memcpy(cmd->vk.dynamic_graphics_state.dirty, + cmd->vk.dynamic_graphics_state.set, + sizeof(cmd->vk.dynamic_graphics_state.set)); + + for (uint32_t stage = 0; stage < ARRAY_SIZE(save->shaders); stage++) { + hk_cmd_bind_graphics_shader(cmd, stage, save->shaders[stage]); + } + + hk_cmd_bind_vertex_buffer(cmd, 0, save->vb0); + + /* Restore queries */ + cmd->state.gfx.occlusion.mode = save->occlusion; + cmd->state.gfx.dirty |= HK_DIRTY_OCCLUSION; + + desc->root.draw.pipeline_stats_flags = save->pipeline_stats_flags; + desc->root_dirty = true; + } else { + hk_cmd_bind_compute_shader(cmd, save->shaders[MESA_SHADER_COMPUTE]); + } + + memcpy(desc->root.push, save->push, sizeof(save->push)); + cmd->in_meta = false; +} + +#define VK_META_OBJECT_KEY_COPY_IMAGE_TO_BUFFER_PIPELINE (0xcafe0000) +#define VK_META_OBJECT_KEY_FILL_PIPELINE (0xcafe0001) + +#define BINDING_OUTPUT 0 +#define BINDING_INPUT 1 + +static VkFormat +aspect_format(VkFormat fmt, VkImageAspectFlags aspect) +{ + bool depth = (aspect & VK_IMAGE_ASPECT_DEPTH_BIT); + bool stencil = (aspect & VK_IMAGE_ASPECT_STENCIL_BIT); + + enum pipe_format p_format = vk_format_to_pipe_format(fmt); + + if (util_format_is_depth_or_stencil(p_format)) { + assert(depth ^ stencil); + if (depth) { + switch (fmt) { + case VK_FORMAT_D32_SFLOAT: + case VK_FORMAT_D32_SFLOAT_S8_UINT: + return VK_FORMAT_D32_SFLOAT; + case VK_FORMAT_D16_UNORM: + case VK_FORMAT_D16_UNORM_S8_UINT: + return VK_FORMAT_D16_UNORM; + default: + unreachable("invalid depth"); + } + } else { + switch (fmt) { + case VK_FORMAT_S8_UINT: + case VK_FORMAT_D32_SFLOAT_S8_UINT: + case VK_FORMAT_D16_UNORM_S8_UINT: + return VK_FORMAT_S8_UINT; + default: + unreachable("invalid stencil"); + } + } + } + + assert(!depth && !stencil); + + const struct vk_format_ycbcr_info *ycbcr_info = + vk_format_get_ycbcr_info(fmt); + + if (ycbcr_info) { + switch (aspect) { + case VK_IMAGE_ASPECT_PLANE_0_BIT: + return ycbcr_info->planes[0].format; + case VK_IMAGE_ASPECT_PLANE_1_BIT: + return ycbcr_info->planes[1].format; + case VK_IMAGE_ASPECT_PLANE_2_BIT: + return ycbcr_info->planes[2].format; + default: + unreachable("invalid ycbcr aspect"); + } + } + + return fmt; +} + +static VkFormat +canonical_format(VkFormat fmt) +{ + enum pipe_format p_format = vk_format_to_pipe_format(fmt); + + if 
(util_format_is_depth_or_stencil(p_format)) + return fmt; + + switch (util_format_get_blocksize(p_format)) { + case 1: + return VK_FORMAT_R8_UINT; + case 2: + return VK_FORMAT_R16_UINT; + case 4: + return VK_FORMAT_R32_UINT; + case 8: + return VK_FORMAT_R32G32_UINT; + case 16: + return VK_FORMAT_R32G32B32A32_UINT; + default: + unreachable("invalid bpp"); + } +} + +enum copy_type { + BUF2IMG, + IMG2BUF, + IMG2IMG, +}; + +struct vk_meta_push_data { + uint32_t buffer_offset; + uint32_t row_extent; + uint32_t slice_or_layer_extent; + + int32_t src_offset_el[4]; + int32_t dst_offset_el[4]; + uint32_t grid_el[3]; +} PACKED; + +#define get_push(b, name) \ + nir_load_push_constant( \ + b, 1, sizeof(((struct vk_meta_push_data *)0)->name) * 8, \ + nir_imm_int(b, offsetof(struct vk_meta_push_data, name))) + +struct vk_meta_image_copy_key { + enum vk_meta_object_key_type key_type; + enum copy_type type; + unsigned block_size; + unsigned nr_samples; +}; + +static nir_def * +linearize_coords(nir_builder *b, nir_def *coord, + const struct vk_meta_image_copy_key *key) +{ + assert(key->nr_samples == 1 && "buffer<-->image copies not multisampled"); + + nir_def *row_extent = get_push(b, row_extent); + nir_def *slice_or_layer_extent = get_push(b, slice_or_layer_extent); + nir_def *x = nir_channel(b, coord, 0); + nir_def *y = nir_channel(b, coord, 1); + nir_def *z_or_layer = nir_channel(b, coord, 2); + + nir_def *v = get_push(b, buffer_offset); + + v = nir_iadd(b, v, nir_imul_imm(b, x, key->block_size)); + v = nir_iadd(b, v, nir_imul(b, y, row_extent)); + v = nir_iadd(b, v, nir_imul(b, z_or_layer, slice_or_layer_extent)); + + return nir_udiv_imm(b, v, key->block_size); +} + +static nir_shader * +build_image_copy_shader(const struct vk_meta_image_copy_key *key) +{ + nir_builder build = + nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL, "vk-meta-copy"); + + nir_builder *b = &build; + b->shader->info.workgroup_size[0] = 32; + b->shader->info.workgroup_size[1] = 32; + + bool src_is_buf = key->type == BUF2IMG; + bool dst_is_buf = key->type == IMG2BUF; + + bool msaa = key->nr_samples > 1; + enum glsl_sampler_dim dim_2d = + msaa ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D; + enum glsl_sampler_dim dim_src = src_is_buf ? GLSL_SAMPLER_DIM_BUF : dim_2d; + enum glsl_sampler_dim dim_dst = dst_is_buf ? 
GLSL_SAMPLER_DIM_BUF : dim_2d; + + const struct glsl_type *texture_type = + glsl_sampler_type(dim_src, false, !src_is_buf, GLSL_TYPE_UINT); + + const struct glsl_type *image_type = + glsl_image_type(dim_dst, !dst_is_buf, GLSL_TYPE_UINT); + + nir_variable *texture = + nir_variable_create(b->shader, nir_var_uniform, texture_type, "source"); + nir_variable *image = + nir_variable_create(b->shader, nir_var_image, image_type, "dest"); + + image->data.descriptor_set = 0; + image->data.binding = BINDING_OUTPUT; + image->data.access = ACCESS_NON_READABLE; + + texture->data.descriptor_set = 0; + texture->data.binding = BINDING_INPUT; + + /* Grab the offset vectors */ + nir_def *src_offset_el = nir_load_push_constant( + b, 3, 32, + nir_imm_int(b, offsetof(struct vk_meta_push_data, src_offset_el))); + + nir_def *dst_offset_el = nir_load_push_constant( + b, 3, 32, + nir_imm_int(b, offsetof(struct vk_meta_push_data, dst_offset_el))); + + nir_def *grid_el = nir_load_push_constant( + b, 3, 32, nir_imm_int(b, offsetof(struct vk_meta_push_data, grid_el))); + + /* We're done setting up variables, do the copy */ + nir_def *coord = nir_load_global_invocation_id(b, 32); + + nir_push_if(b, + nir_ball(b, nir_trim_vector(b, nir_ult(b, coord, grid_el), 2))); + { + nir_def *src_coord = nir_iadd(b, coord, src_offset_el); + nir_def *dst_coord = nir_iadd(b, coord, dst_offset_el); + + /* Special case handle buffer indexing */ + if (dst_is_buf) { + dst_coord = linearize_coords(b, coord, key); + } else if (src_is_buf) { + src_coord = linearize_coords(b, coord, key); + } + + /* Copy formatted texel from texture to storage image */ + for (unsigned s = 0; s < key->nr_samples; ++s) { + nir_deref_instr *deref = nir_build_deref_var(b, texture); + nir_def *ms_index = nir_imm_int(b, s); + + nir_def *value = msaa ? nir_txf_ms_deref(b, deref, src_coord, ms_index) + : nir_txf_deref(b, deref, src_coord, NULL); + + nir_image_deref_store(b, &nir_build_deref_var(b, image)->def, + nir_pad_vec4(b, dst_coord), ms_index, value, + nir_imm_int(b, 0), .image_dim = dim_dst, + .image_array = !dst_is_buf); + } + } + nir_pop_if(b, NULL); + return b->shader; +} + +static VkResult +get_image_copy_descriptor_set_layout(struct vk_device *device, + struct vk_meta_device *meta, + VkDescriptorSetLayout *layout_out, + enum copy_type type) +{ + const char *keys[] = { + [IMG2BUF] = "vk-meta-copy-image-to-buffer-descriptor-set-layout", + [BUF2IMG] = "vk-meta-copy-buffer-to-image-descriptor-set-layout", + [IMG2IMG] = "vk-meta-copy-image-to-image-descriptor-set-layout", + }; + + VkDescriptorSetLayout from_cache = vk_meta_lookup_descriptor_set_layout( + meta, keys[type], strlen(keys[type])); + if (from_cache != VK_NULL_HANDLE) { + *layout_out = from_cache; + return VK_SUCCESS; + } + + const VkDescriptorSetLayoutBinding bindings[] = { + { + .binding = BINDING_OUTPUT, + .descriptorType = type != IMG2BUF + ? VK_DESCRIPTOR_TYPE_STORAGE_IMAGE + : VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + }, + { + .binding = BINDING_INPUT, + .descriptorType = type == BUF2IMG + ? 
VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER + : VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + }, + }; + + const VkDescriptorSetLayoutCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .bindingCount = ARRAY_SIZE(bindings), + .pBindings = bindings, + }; + + return vk_meta_create_descriptor_set_layout(device, meta, &info, keys[type], + strlen(keys[type]), layout_out); +} + +static VkResult +get_image_copy_pipeline_layout(struct vk_device *device, + struct vk_meta_device *meta, + struct vk_meta_image_copy_key *key, + VkDescriptorSetLayout set_layout, + VkPipelineLayout *layout_out, + enum copy_type type) +{ + const char *keys[] = { + [IMG2BUF] = "vk-meta-copy-image-to-buffer-pipeline-layout", + [BUF2IMG] = "vk-meta-copy-buffer-to-image-pipeline-layout", + [IMG2IMG] = "vk-meta-copy-image-to-image-pipeline-layout", + }; + + VkPipelineLayout from_cache = + vk_meta_lookup_pipeline_layout(meta, keys[type], strlen(keys[type])); + if (from_cache != VK_NULL_HANDLE) { + *layout_out = from_cache; + return VK_SUCCESS; + } + + VkPipelineLayoutCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .setLayoutCount = 1, + .pSetLayouts = &set_layout, + }; + + const VkPushConstantRange push_range = { + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .offset = 0, + .size = sizeof(struct vk_meta_push_data), + }; + + info.pushConstantRangeCount = 1; + info.pPushConstantRanges = &push_range; + + return vk_meta_create_pipeline_layout(device, meta, &info, keys[type], + strlen(keys[type]), layout_out); +} + +static VkResult +get_image_copy_pipeline(struct vk_device *device, struct vk_meta_device *meta, + const struct vk_meta_image_copy_key *key, + VkPipelineLayout layout, VkPipeline *pipeline_out) +{ + VkPipeline from_cache = vk_meta_lookup_pipeline(meta, key, sizeof(*key)); + if (from_cache != VK_NULL_HANDLE) { + *pipeline_out = from_cache; + return VK_SUCCESS; + } + + const VkPipelineShaderStageNirCreateInfoMESA nir_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_NIR_CREATE_INFO_MESA, + .nir = build_image_copy_shader(key), + }; + const VkPipelineShaderStageCreateInfo cs_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = &nir_info, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .pName = "main", + }; + + const VkComputePipelineCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .stage = cs_info, + .layout = layout, + }; + + VkResult result = vk_meta_create_compute_pipeline( + device, meta, &info, key, sizeof(*key), pipeline_out); + ralloc_free(nir_info.nir); + + return result; +} + +static void +hk_meta_copy_image_to_buffer2(struct vk_command_buffer *cmd, + struct vk_meta_device *meta, + const VkCopyImageToBufferInfo2 *pCopyBufferInfo) +{ + VK_FROM_HANDLE(vk_image, image, pCopyBufferInfo->srcImage); + VK_FROM_HANDLE(vk_image, src_image, pCopyBufferInfo->srcImage); + + struct vk_device *device = cmd->base.device; + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + + VkResult result; + + VkDescriptorSetLayout set_layout; + result = + get_image_copy_descriptor_set_layout(device, meta, &set_layout, IMG2BUF); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + bool per_layer = + util_format_is_compressed(vk_format_to_pipe_format(image->format)); + + for (unsigned i = 0; i < pCopyBufferInfo->regionCount; ++i) { + const VkBufferImageCopy2 *region = 
&pCopyBufferInfo->pRegions[i]; + + unsigned layers = MAX2(region->imageExtent.depth, + vk_image_subresource_layer_count( + src_image, ®ion->imageSubresource)); + unsigned layer_iters = per_layer ? layers : 1; + + for (unsigned layer_offs = 0; layer_offs < layer_iters; ++layer_offs) { + + VkImageAspectFlags aspect = region->imageSubresource.aspectMask; + VkFormat aspect_fmt = aspect_format(image->format, aspect); + VkFormat canonical = canonical_format(aspect_fmt); + + uint32_t blocksize_B = + util_format_get_blocksize(vk_format_to_pipe_format(canonical)); + + enum pipe_format p_format = vk_format_to_pipe_format(image->format); + + unsigned row_extent = util_format_get_nblocksx( + p_format, MAX2(region->bufferRowLength, + region->imageExtent.width)) * + blocksize_B; + unsigned slice_extent = + util_format_get_nblocksy( + p_format, + MAX2(region->bufferImageHeight, region->imageExtent.height)) * + row_extent; + unsigned layer_extent = + util_format_get_nblocksz(p_format, region->imageExtent.depth) * + slice_extent; + + bool is_3d = region->imageExtent.depth > 1; + + struct vk_meta_image_copy_key key = { + .key_type = VK_META_OBJECT_KEY_COPY_IMAGE_TO_BUFFER_PIPELINE, + .type = IMG2BUF, + .block_size = blocksize_B, + .nr_samples = image->samples, + }; + + VkPipelineLayout pipeline_layout; + result = get_image_copy_pipeline_layout(device, meta, &key, set_layout, + &pipeline_layout, false); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + VkImageView src_view; + const VkImageViewUsageCreateInfo src_view_usage = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_USAGE_CREATE_INFO, + .usage = VK_IMAGE_USAGE_SAMPLED_BIT, + }; + const VkImageViewCreateInfo src_view_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .flags = VK_IMAGE_VIEW_CREATE_INTERNAL_MESA, + .pNext = &src_view_usage, + .image = pCopyBufferInfo->srcImage, + .viewType = VK_IMAGE_VIEW_TYPE_2D_ARRAY, + .format = canonical, + .subresourceRange = + { + .aspectMask = region->imageSubresource.aspectMask, + .baseMipLevel = region->imageSubresource.mipLevel, + .baseArrayLayer = + MAX2(region->imageOffset.z, + region->imageSubresource.baseArrayLayer) + + layer_offs, + .layerCount = per_layer ? 1 : layers, + .levelCount = 1, + }, + }; + + result = + vk_meta_create_image_view(cmd, meta, &src_view_info, &src_view); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + VkDescriptorImageInfo src_info = { + .imageLayout = pCopyBufferInfo->srcImageLayout, + .imageView = src_view, + }; + + VkWriteDescriptorSet desc_writes[2]; + + const VkBufferViewCreateInfo dst_view_info = { + .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO, + .buffer = pCopyBufferInfo->dstBuffer, + .format = canonical, + + /* Ideally, this would be region->bufferOffset, but that might not + * be aligned to minTexelBufferOffsetAlignment. Instead, we use a 0 + * offset (which is definitely aligned) and add the offset ourselves + * in the shader. 
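+          * (linearize_coords reads buffer_offset from the push constants and
+          * folds it into the linear texel index.)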
+ */ + .offset = 0, + .range = VK_WHOLE_SIZE, + }; + + VkBufferView dst_view; + VkResult result = + vk_meta_create_buffer_view(cmd, meta, &dst_view_info, &dst_view); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + desc_writes[0] = (VkWriteDescriptorSet){ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = 0, + .dstBinding = BINDING_OUTPUT, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, + .descriptorCount = 1, + .pTexelBufferView = &dst_view, + }; + + desc_writes[1] = (VkWriteDescriptorSet){ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = 0, + .dstBinding = BINDING_INPUT, + .descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, + .descriptorCount = 1, + .pImageInfo = &src_info, + }; + + disp->CmdPushDescriptorSetKHR( + vk_command_buffer_to_handle(cmd), VK_PIPELINE_BIND_POINT_COMPUTE, + pipeline_layout, 0, ARRAY_SIZE(desc_writes), desc_writes); + + VkPipeline pipeline; + result = get_image_copy_pipeline(device, meta, &key, pipeline_layout, + &pipeline); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + disp->CmdBindPipeline(vk_command_buffer_to_handle(cmd), + VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + enum pipe_format p_src_fmt = + vk_format_to_pipe_format(src_image->format); + + struct vk_meta_push_data push = { + .buffer_offset = region->bufferOffset, + .row_extent = row_extent, + .slice_or_layer_extent = is_3d ? slice_extent : layer_extent, + + .src_offset_el[0] = + util_format_get_nblocksx(p_src_fmt, region->imageOffset.x), + .src_offset_el[1] = + util_format_get_nblocksy(p_src_fmt, region->imageOffset.y), + + .grid_el[0] = + util_format_get_nblocksx(p_format, region->imageExtent.width), + .grid_el[1] = + util_format_get_nblocksy(p_format, region->imageExtent.height), + .grid_el[2] = per_layer ? 1 : layers, + }; + + push.buffer_offset += push.slice_or_layer_extent * layer_offs; + + disp->CmdPushConstants(vk_command_buffer_to_handle(cmd), + pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, + sizeof(push), &push); + + disp->CmdDispatch(vk_command_buffer_to_handle(cmd), + DIV_ROUND_UP(push.grid_el[0], 32), + DIV_ROUND_UP(push.grid_el[1], 32), push.grid_el[2]); + } + } +} + +static void +hk_meta_copy_buffer_to_image2(struct vk_command_buffer *cmd, + struct vk_meta_device *meta, + const struct VkCopyBufferToImageInfo2 *info) +{ + VK_FROM_HANDLE(vk_image, image, info->dstImage); + + struct vk_device *device = cmd->base.device; + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + + VkDescriptorSetLayout set_layout; + VkResult result = + get_image_copy_descriptor_set_layout(device, meta, &set_layout, BUF2IMG); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + bool per_layer = + util_format_is_compressed(vk_format_to_pipe_format(image->format)); + + for (unsigned r = 0; r < info->regionCount; ++r) { + const VkBufferImageCopy2 *region = &info->pRegions[r]; + + unsigned layers = MAX2( + region->imageExtent.depth, + vk_image_subresource_layer_count(image, ®ion->imageSubresource)); + unsigned layer_iters = per_layer ? 
layers : 1; + + for (unsigned layer_offs = 0; layer_offs < layer_iters; ++layer_offs) { + VkImageAspectFlags aspect = region->imageSubresource.aspectMask; + VkFormat aspect_fmt = aspect_format(image->format, aspect); + VkFormat canonical = canonical_format(aspect_fmt); + enum pipe_format p_format = vk_format_to_pipe_format(aspect_fmt); + uint32_t blocksize_B = util_format_get_blocksize(p_format); + bool is_3d = region->imageExtent.depth > 1; + + struct vk_meta_image_copy_key key = { + .key_type = VK_META_OBJECT_KEY_COPY_IMAGE_TO_BUFFER_PIPELINE, + .type = BUF2IMG, + .block_size = blocksize_B, + .nr_samples = image->samples, + }; + + VkPipelineLayout pipeline_layout; + result = get_image_copy_pipeline_layout(device, meta, &key, set_layout, + &pipeline_layout, true); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + VkWriteDescriptorSet desc_writes[2]; + + unsigned row_extent = util_format_get_nblocksx( + p_format, MAX2(region->bufferRowLength, + region->imageExtent.width)) * + blocksize_B; + unsigned slice_extent = + util_format_get_nblocksy( + p_format, + MAX2(region->bufferImageHeight, region->imageExtent.height)) * + row_extent; + unsigned layer_extent = + util_format_get_nblocksz(p_format, region->imageExtent.depth) * + slice_extent; + + /* Create a view into the source buffer as a texel buffer */ + const VkBufferViewCreateInfo src_view_info = { + .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO, + .buffer = info->srcBuffer, + .format = canonical, + + /* Ideally, this would be region->bufferOffset, but that might not + * be aligned to minTexelBufferOffsetAlignment. Instead, we use a 0 + * offset (which is definitely aligned) and add the offset ourselves + * in the shader. + */ + .offset = 0, + .range = VK_WHOLE_SIZE, + }; + + assert((region->bufferOffset % blocksize_B) == 0 && "must be aligned"); + + VkBufferView src_view; + result = + vk_meta_create_buffer_view(cmd, meta, &src_view_info, &src_view); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + VkImageView dst_view; + const VkImageViewUsageCreateInfo dst_view_usage = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_USAGE_CREATE_INFO, + .usage = VK_IMAGE_USAGE_STORAGE_BIT, + }; + const VkImageViewCreateInfo dst_view_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .flags = VK_IMAGE_VIEW_CREATE_INTERNAL_MESA, + .pNext = &dst_view_usage, + .image = info->dstImage, + .viewType = VK_IMAGE_VIEW_TYPE_2D_ARRAY, + .format = canonical, + .subresourceRange = + { + .aspectMask = region->imageSubresource.aspectMask, + .baseMipLevel = region->imageSubresource.mipLevel, + .baseArrayLayer = + MAX2(region->imageOffset.z, + region->imageSubresource.baseArrayLayer) + + layer_offs, + .layerCount = per_layer ? 
1 : layers, + .levelCount = 1, + }, + }; + + result = + vk_meta_create_image_view(cmd, meta, &dst_view_info, &dst_view); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + const VkDescriptorImageInfo dst_info = { + .imageView = dst_view, + .imageLayout = info->dstImageLayout, + }; + + desc_writes[0] = (VkWriteDescriptorSet){ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = 0, + .dstBinding = BINDING_OUTPUT, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .descriptorCount = 1, + .pImageInfo = &dst_info, + }; + + desc_writes[1] = (VkWriteDescriptorSet){ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = 0, + .dstBinding = BINDING_INPUT, + .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, + .descriptorCount = 1, + .pTexelBufferView = &src_view, + }; + + disp->CmdPushDescriptorSetKHR( + vk_command_buffer_to_handle(cmd), VK_PIPELINE_BIND_POINT_COMPUTE, + pipeline_layout, 0, ARRAY_SIZE(desc_writes), desc_writes); + + VkPipeline pipeline; + result = get_image_copy_pipeline(device, meta, &key, pipeline_layout, + &pipeline); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + disp->CmdBindPipeline(vk_command_buffer_to_handle(cmd), + VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + struct vk_meta_push_data push = { + .buffer_offset = region->bufferOffset, + .row_extent = row_extent, + .slice_or_layer_extent = is_3d ? slice_extent : layer_extent, + + .dst_offset_el[0] = + util_format_get_nblocksx(p_format, region->imageOffset.x), + .dst_offset_el[1] = + util_format_get_nblocksy(p_format, region->imageOffset.y), + + .grid_el[0] = + util_format_get_nblocksx(p_format, region->imageExtent.width), + .grid_el[1] = + util_format_get_nblocksy(p_format, region->imageExtent.height), + .grid_el[2] = per_layer ? 1 : layers, + }; + + push.buffer_offset += push.slice_or_layer_extent * layer_offs; + + disp->CmdPushConstants(vk_command_buffer_to_handle(cmd), + pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, + sizeof(push), &push); + + disp->CmdDispatch(vk_command_buffer_to_handle(cmd), + DIV_ROUND_UP(push.grid_el[0], 32), + DIV_ROUND_UP(push.grid_el[1], 32), push.grid_el[2]); + } + } +} + +static void +hk_meta_copy_image2(struct vk_command_buffer *cmd, struct vk_meta_device *meta, + const struct VkCopyImageInfo2 *info) +{ + VK_FROM_HANDLE(vk_image, src_image, info->srcImage); + VK_FROM_HANDLE(vk_image, dst_image, info->dstImage); + + struct vk_device *device = cmd->base.device; + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + + VkDescriptorSetLayout set_layout; + VkResult result = + get_image_copy_descriptor_set_layout(device, meta, &set_layout, BUF2IMG); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + bool per_layer = + util_format_is_compressed(vk_format_to_pipe_format(src_image->format)) || + util_format_is_compressed(vk_format_to_pipe_format(dst_image->format)); + + for (unsigned r = 0; r < info->regionCount; ++r) { + const VkImageCopy2 *region = &info->pRegions[r]; + + unsigned layers = MAX2( + vk_image_subresource_layer_count(src_image, ®ion->srcSubresource), + region->extent.depth); + unsigned layer_iters = per_layer ? 
layers : 1; + + for (unsigned layer_offs = 0; layer_offs < layer_iters; ++layer_offs) { + u_foreach_bit(aspect, region->srcSubresource.aspectMask) { + /* We use the source format throughout for consistent scaling with + * compressed<-->uncompressed copies, where the extents are defined + * to follow the source. + */ + VkFormat aspect_fmt = aspect_format(src_image->format, 1 << aspect); + VkFormat canonical = canonical_format(aspect_fmt); + uint32_t blocksize_B = + util_format_get_blocksize(vk_format_to_pipe_format(canonical)); + + struct vk_meta_image_copy_key key = { + .key_type = VK_META_OBJECT_KEY_COPY_IMAGE_TO_BUFFER_PIPELINE, + .type = IMG2IMG, + .block_size = blocksize_B, + .nr_samples = dst_image->samples, + }; + + assert(key.nr_samples == src_image->samples); + + VkPipelineLayout pipeline_layout; + result = get_image_copy_pipeline_layout( + device, meta, &key, set_layout, &pipeline_layout, true); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + VkWriteDescriptorSet desc_writes[2]; + + VkImageView src_view; + const VkImageViewUsageCreateInfo src_view_usage = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_USAGE_CREATE_INFO, + .usage = VK_IMAGE_USAGE_SAMPLED_BIT, + }; + const VkImageViewCreateInfo src_view_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .flags = VK_IMAGE_VIEW_CREATE_INTERNAL_MESA, + .pNext = &src_view_usage, + .image = info->srcImage, + .viewType = VK_IMAGE_VIEW_TYPE_2D_ARRAY, + .format = canonical, + .subresourceRange = + { + .aspectMask = + region->srcSubresource.aspectMask & (1 << aspect), + .baseMipLevel = region->srcSubresource.mipLevel, + .baseArrayLayer = + MAX2(region->srcOffset.z, + region->srcSubresource.baseArrayLayer) + + layer_offs, + .layerCount = per_layer ? 1 : layers, + .levelCount = 1, + }, + }; + + result = + vk_meta_create_image_view(cmd, meta, &src_view_info, &src_view); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + VkDescriptorImageInfo src_info = { + .imageLayout = info->srcImageLayout, + .imageView = src_view, + }; + + VkImageView dst_view; + const VkImageViewUsageCreateInfo dst_view_usage = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_USAGE_CREATE_INFO, + .usage = VK_IMAGE_USAGE_STORAGE_BIT, + }; + const VkImageViewCreateInfo dst_view_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .flags = VK_IMAGE_VIEW_CREATE_INTERNAL_MESA, + .pNext = &dst_view_usage, + .image = info->dstImage, + .viewType = VK_IMAGE_VIEW_TYPE_2D_ARRAY, + .format = canonical, + .subresourceRange = + { + .aspectMask = + vk_format_get_ycbcr_info(dst_image->format) || + vk_format_get_ycbcr_info(src_image->format) + ? region->dstSubresource.aspectMask + : (1 << aspect), + .baseMipLevel = region->dstSubresource.mipLevel, + .baseArrayLayer = + MAX2(region->dstOffset.z, + region->dstSubresource.baseArrayLayer) + + layer_offs, + .layerCount = per_layer ? 
1 : layers, + .levelCount = 1, + }, + }; + + result = + vk_meta_create_image_view(cmd, meta, &dst_view_info, &dst_view); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + const VkDescriptorImageInfo dst_info = { + .imageView = dst_view, + .imageLayout = info->dstImageLayout, + }; + + desc_writes[0] = (VkWriteDescriptorSet){ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = 0, + .dstBinding = BINDING_OUTPUT, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .descriptorCount = 1, + .pImageInfo = &dst_info, + }; + + desc_writes[1] = (VkWriteDescriptorSet){ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = 0, + .dstBinding = BINDING_INPUT, + .descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, + .descriptorCount = 1, + .pImageInfo = &src_info, + }; + + disp->CmdPushDescriptorSetKHR( + vk_command_buffer_to_handle(cmd), VK_PIPELINE_BIND_POINT_COMPUTE, + pipeline_layout, 0, ARRAY_SIZE(desc_writes), desc_writes); + + VkPipeline pipeline; + result = get_image_copy_pipeline(device, meta, &key, + pipeline_layout, &pipeline); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + disp->CmdBindPipeline(vk_command_buffer_to_handle(cmd), + VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + enum pipe_format p_src_fmt = + vk_format_to_pipe_format(src_image->format); + enum pipe_format p_dst_fmt = + vk_format_to_pipe_format(dst_image->format); + enum pipe_format p_format = vk_format_to_pipe_format(aspect_fmt); + + struct vk_meta_push_data push = { + .src_offset_el[0] = + util_format_get_nblocksx(p_src_fmt, region->srcOffset.x), + .src_offset_el[1] = + util_format_get_nblocksy(p_src_fmt, region->srcOffset.y), + + .dst_offset_el[0] = + util_format_get_nblocksx(p_dst_fmt, region->dstOffset.x), + .dst_offset_el[1] = + util_format_get_nblocksy(p_dst_fmt, region->dstOffset.y), + + .grid_el[0] = + util_format_get_nblocksx(p_format, region->extent.width), + .grid_el[1] = + util_format_get_nblocksy(p_format, region->extent.height), + .grid_el[2] = per_layer ? 
1 : layers, + }; + + disp->CmdPushConstants(vk_command_buffer_to_handle(cmd), + pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(push), &push); + + disp->CmdDispatch(vk_command_buffer_to_handle(cmd), + DIV_ROUND_UP(push.grid_el[0], 32), + DIV_ROUND_UP(push.grid_el[1], 32), + push.grid_el[2]); + } + } + } +} + +struct vk_meta_image_to_buffer_push_data { + uint32_t dest_offset_el; +}; + +#define get_image_push(b, name) \ + nir_load_push_constant( \ + b, 1, sizeof(((struct vk_meta_image_to_buffer_push_data *)0)->name) * 8, \ + nir_imm_int(b, \ + offsetof(struct vk_meta_image_to_buffer_push_data, name))) + +enum copy_source { + COPY_SOURCE_PATTERN, + COPY_SOURCE_BUFFER, +}; + +struct vk_meta_buffer_copy_key { + enum vk_meta_object_key_type key_type; + enum copy_source source; + + /* Power-of-two block size for the transfer, range [1, 16] */ + uint8_t blocksize; + uint8_t pad[3]; +}; +static_assert(sizeof(struct vk_meta_buffer_copy_key) == 12, "packed"); + +/* XXX: TODO: move to common */ +/* Copyright © Microsoft Corporation */ +static nir_def * +dzn_nir_create_bo_desc(nir_builder *b, nir_variable_mode mode, + uint32_t desc_set, uint32_t binding, const char *name, + unsigned access, const struct glsl_type *dummy_type) +{ + nir_variable *var = nir_variable_create(b->shader, mode, dummy_type, name); + var->data.descriptor_set = desc_set; + var->data.binding = binding; + var->data.access = access; + + assert(mode == nir_var_mem_ubo || mode == nir_var_mem_ssbo); + if (mode == nir_var_mem_ubo) + b->shader->info.num_ubos++; + else + b->shader->info.num_ssbos++; + + VkDescriptorType desc_type = var->data.mode == nir_var_mem_ubo + ? VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER + : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + nir_address_format addr_format = + nir_address_format_64bit_global_32bit_offset; /* XXX */ + nir_def *index = nir_vulkan_resource_index( + b, nir_address_format_num_components(addr_format), + nir_address_format_bit_size(addr_format), nir_imm_int(b, 0), + .desc_set = desc_set, .binding = binding, .desc_type = desc_type); + + nir_def *desc = nir_load_vulkan_descriptor( + b, nir_address_format_num_components(addr_format), + nir_address_format_bit_size(addr_format), index, .desc_type = desc_type); + + return desc; +} + +static const struct glsl_type * +type_for_blocksize(uint8_t blocksize) +{ + assert(util_is_power_of_two_nonzero(blocksize) && blocksize <= 16); + + if (blocksize > 4) + return glsl_vector_type(GLSL_TYPE_UINT, blocksize / 4); + else + return glsl_uintN_t_type(8 * blocksize); +} + +static nir_shader * +build_buffer_copy_shader(const struct vk_meta_buffer_copy_key *key) +{ + nir_builder build = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL, + "vk-meta-copy-to-buffer"); + nir_builder *b = &build; + + const struct glsl_type *type = + glsl_array_type(type_for_blocksize(key->blocksize), 0, key->blocksize); + + nir_def *index = nir_channel(b, nir_load_global_invocation_id(b, 32), 0); + nir_def *value; + + if (key->source == COPY_SOURCE_BUFFER) { + nir_def *ubo = + dzn_nir_create_bo_desc(b, nir_var_mem_ubo, 0, BINDING_INPUT, "source", + ACCESS_NON_WRITEABLE, type); + nir_deref_instr *ubo_deref = + nir_build_deref_cast(b, ubo, nir_var_mem_ubo, type, key->blocksize); + + nir_deref_instr *element_deref = nir_build_deref_array( + b, ubo_deref, nir_u2uN(b, index, ubo_deref->def.bit_size)); + + value = nir_load_deref(b, element_deref); + } else { + nir_def *pattern = nir_load_push_constant(b, 1, 32, nir_imm_int(b, 0)); + + assert(key->blocksize >= 4 && "fills at least 32-bit"); + 
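+      /* Splat the 32-bit fill pattern across all (blocksize / 4) components. */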
value = nir_replicate(b, pattern, key->blocksize / 4); + } + + /* Write out raw bytes to SSBO */ + nir_def *ssbo = + dzn_nir_create_bo_desc(b, nir_var_mem_ssbo, 0, BINDING_OUTPUT, + "destination", ACCESS_NON_READABLE, type); + + nir_deref_instr *ssbo_deref = + nir_build_deref_cast(b, ssbo, nir_var_mem_ssbo, type, key->blocksize); + + nir_deref_instr *element_deref = nir_build_deref_array( + b, ssbo_deref, nir_u2uN(b, index, ssbo_deref->def.bit_size)); + + nir_store_deref(b, element_deref, value, + nir_component_mask(value->num_components)); + + return b->shader; +} + +static VkResult +get_buffer_copy_descriptor_set_layout(struct vk_device *device, + struct vk_meta_device *meta, + VkDescriptorSetLayout *layout_out, + enum copy_source source) +{ + const char buffer_key[] = "vk-meta-buffer-copy-descriptor-set-layout"; + const char fill_key[] = "vk-meta-fill__-copy-descriptor-set-layout"; + + static_assert(sizeof(buffer_key) == sizeof(fill_key)); + const char *key = source == COPY_SOURCE_BUFFER ? buffer_key : fill_key; + + VkDescriptorSetLayout from_cache = + vk_meta_lookup_descriptor_set_layout(meta, key, sizeof(buffer_key)); + if (from_cache != VK_NULL_HANDLE) { + *layout_out = from_cache; + return VK_SUCCESS; + } + + const VkDescriptorSetLayoutBinding bindings[] = { + { + .binding = BINDING_OUTPUT, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + }, + { + .binding = BINDING_INPUT, + .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + }, + }; + + const VkDescriptorSetLayoutCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .bindingCount = ARRAY_SIZE(bindings), + .pBindings = bindings, + }; + + return vk_meta_create_descriptor_set_layout(device, meta, &info, key, + sizeof(key), layout_out); +} + +static VkResult +get_buffer_copy_pipeline_layout(struct vk_device *device, + struct vk_meta_device *meta, + struct vk_meta_buffer_copy_key *key, + VkDescriptorSetLayout set_layout, + VkPipelineLayout *layout_out) +{ + const char copy_key[] = "vk-meta-buffer-copy-pipeline-layout"; + const char fill_key[] = "vk-meta-buffer-fill-pipeline-layout"; + const char cimg_key[] = "vk-meta-buffer-cimg-pipeline-layout"; + + STATIC_ASSERT(sizeof(copy_key) == sizeof(fill_key)); + STATIC_ASSERT(sizeof(copy_key) == sizeof(cimg_key)); + const char *pipeline_key = + key->source == COPY_SOURCE_BUFFER ? 
copy_key : fill_key; + + VkPipelineLayout from_cache = + vk_meta_lookup_pipeline_layout(meta, pipeline_key, sizeof(copy_key)); + if (from_cache != VK_NULL_HANDLE) { + *layout_out = from_cache; + return VK_SUCCESS; + } + + VkPipelineLayoutCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .setLayoutCount = 1, + .pSetLayouts = &set_layout, + }; + + size_t push_size = 0; + if (key->source == COPY_SOURCE_PATTERN) + push_size = sizeof(uint32_t); + + const VkPushConstantRange push_range = { + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .offset = 0, + .size = push_size, + }; + + if (push_size) { + info.pushConstantRangeCount = 1; + info.pPushConstantRanges = &push_range; + } + + return vk_meta_create_pipeline_layout(device, meta, &info, pipeline_key, + sizeof(copy_key), layout_out); +} + +static VkResult +get_buffer_copy_pipeline(struct vk_device *device, struct vk_meta_device *meta, + const struct vk_meta_buffer_copy_key *key, + VkPipelineLayout layout, VkPipeline *pipeline_out) +{ + VkPipeline from_cache = vk_meta_lookup_pipeline(meta, key, sizeof(*key)); + if (from_cache != VK_NULL_HANDLE) { + *pipeline_out = from_cache; + return VK_SUCCESS; + } + + const VkPipelineShaderStageNirCreateInfoMESA nir_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_NIR_CREATE_INFO_MESA, + .nir = build_buffer_copy_shader(key), + }; + const VkPipelineShaderStageCreateInfo cs_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = &nir_info, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .pName = "main", + }; + + const VkComputePipelineCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .stage = cs_info, + .layout = layout, + }; + + VkResult result = vk_meta_create_compute_pipeline( + device, meta, &info, key, sizeof(*key), pipeline_out); + ralloc_free(nir_info.nir); + + return result; +} + +static unsigned +alignment_of(unsigned x) +{ + return 1 << MIN2(__builtin_ctz(x), 31); +} + +struct copy_desc { + enum copy_source source; + + union { + uint32_t pattern; + + struct { + struct vk_buffer *source; + VkDeviceSize srcOffset; + } buffer; + + struct { + struct vk_image *image; + VkDescriptorImageInfo *info; + VkFormat format; + struct vk_meta_image_to_buffer_push_data push; + } image; + }; +}; + +static void +do_copy(struct vk_command_buffer *cmd, struct vk_meta_device *meta, size_t size, + struct vk_buffer *dest, VkDeviceSize dstOffset, struct copy_desc *desc) +{ + struct vk_device *device = cmd->base.device; + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + VkResult result; + + /* The "alignment" of the copy is the maximum alignment that all accesses + * within the copy will satsify. 
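+    * For example, dstOffset = 8 with size = 20 yields an alignment of 4 (a
+    * buffer source's srcOffset is folded in the same way), so the dispatch
+    * below uses a 4-byte block size.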
+ */ + unsigned alignment = MIN2(alignment_of(dstOffset), alignment_of(size)); + + if (desc->source == COPY_SOURCE_BUFFER) + alignment = MIN2(alignment, alignment_of(desc->buffer.srcOffset)); + + struct vk_meta_buffer_copy_key key = { + .key_type = VK_META_OBJECT_KEY_FILL_PIPELINE, + .source = desc->source, + .blocksize = MIN2(alignment, 16), + }; + + VkDescriptorSetLayout set_layout; + result = get_buffer_copy_descriptor_set_layout(device, meta, &set_layout, + desc->source); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + VkPipelineLayout pipeline_layout; + result = get_buffer_copy_pipeline_layout(device, meta, &key, set_layout, + &pipeline_layout); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + VkDescriptorBufferInfo buffer_infos[2]; + VkWriteDescriptorSet desc_writes[2]; + + for (unsigned i = 0; i < 2; ++i) { + bool is_dest = (i == BINDING_OUTPUT); + + if (!is_dest && desc->source != COPY_SOURCE_BUFFER) + continue; + + buffer_infos[i] = (VkDescriptorBufferInfo){ + .buffer = vk_buffer_to_handle(is_dest ? dest : desc->buffer.source), + .offset = is_dest ? dstOffset : desc->buffer.srcOffset, + .range = size, + }; + + desc_writes[i] = (VkWriteDescriptorSet){ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = 0, + .dstBinding = i, + .descriptorType = is_dest ? VK_DESCRIPTOR_TYPE_STORAGE_BUFFER + : VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .descriptorCount = 1, + .pBufferInfo = &buffer_infos[i], + }; + } + + unsigned desc_count = desc->source == COPY_SOURCE_PATTERN ? 1 : 2; + disp->CmdPushDescriptorSetKHR(vk_command_buffer_to_handle(cmd), + VK_PIPELINE_BIND_POINT_COMPUTE, + pipeline_layout, 0, desc_count, desc_writes); + + VkPipeline pipeline; + result = + get_buffer_copy_pipeline(device, meta, &key, pipeline_layout, &pipeline); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + disp->CmdBindPipeline(vk_command_buffer_to_handle(cmd), + VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + if (desc->source == COPY_SOURCE_PATTERN) { + disp->CmdPushConstants(vk_command_buffer_to_handle(cmd), pipeline_layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(uint32_t), + &desc->pattern); + } + + disp->CmdDispatch(vk_command_buffer_to_handle(cmd), size / key.blocksize, 1, + 1); +} + +static void +hk_meta_fill_buffer(struct vk_command_buffer *cmd, struct vk_meta_device *meta, + struct vk_buffer *dest, VkDeviceSize dstOffset, + VkDeviceSize dstRange, uint32_t data) +{ + size_t size = ROUND_DOWN_TO(vk_buffer_range(dest, dstOffset, dstRange), 4); + dstOffset = ROUND_DOWN_TO(dstOffset, 4); + + do_copy(cmd, meta, size, dest, dstOffset, + &(struct copy_desc){ + .source = COPY_SOURCE_PATTERN, + .pattern = data, + }); +} + +static void +hk_meta_update_buffer(struct vk_command_buffer *cmd, + struct vk_meta_device *meta, struct vk_buffer *dest, + VkDeviceSize dstOffset, VkDeviceSize dstRange, + const void *data) +{ + /* Create a buffer to hold the data */ + const VkBufferCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .size = vk_buffer_range(dest, dstOffset, dstRange), + .usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, + .queueFamilyIndexCount = 1, + .pQueueFamilyIndices = &cmd->pool->queue_family_index, + }; + + VkBuffer buffer; + VkResult result = vk_meta_create_buffer(cmd, meta, &info, &buffer); + if (unlikely(result != VK_SUCCESS)) + return; + + /* Map the buffer for CPU access */ + void *map; + result = meta->cmd_bind_map_buffer(cmd, 
meta, buffer, &map); + if (unlikely(result != VK_SUCCESS)) + return; + + /* Copy from the CPU input to the staging buffer */ + memcpy(map, data, info.size); + + /* Copy between the buffers on the GPU */ + VK_FROM_HANDLE(vk_buffer, buffer_, buffer); + size_t size = ROUND_DOWN_TO(vk_buffer_range(dest, dstOffset, dstRange), 4); + dstOffset = ROUND_DOWN_TO(dstOffset, 4); + + do_copy(cmd, meta, size, dest, dstOffset, + &(struct copy_desc){ + .source = COPY_SOURCE_BUFFER, + .buffer.source = buffer_, + }); +} + +static void +hk_meta_copy_buffer2(struct vk_command_buffer *cmd, struct vk_meta_device *meta, + const VkCopyBufferInfo2 *pCopyBufferInfo) +{ + VK_FROM_HANDLE(vk_buffer, dst, pCopyBufferInfo->dstBuffer); + VK_FROM_HANDLE(vk_buffer, src, pCopyBufferInfo->srcBuffer); + + for (unsigned i = 0; i < pCopyBufferInfo->regionCount; ++i) { + const VkBufferCopy2 *copy = &pCopyBufferInfo->pRegions[i]; + + do_copy(cmd, meta, copy->size, dst, copy->dstOffset, + &(struct copy_desc){ + .source = COPY_SOURCE_BUFFER, + .buffer.source = src, + .buffer.srcOffset = copy->srcOffset, + }); + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdBlitImage2(VkCommandBuffer commandBuffer, + const VkBlitImageInfo2 *pBlitImageInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + struct hk_meta_save save; + hk_meta_begin(cmd, &save, VK_PIPELINE_BIND_POINT_GRAPHICS); + vk_meta_blit_image2(&cmd->vk, &dev->meta, pBlitImageInfo); + hk_meta_end(cmd, &save, VK_PIPELINE_BIND_POINT_GRAPHICS); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdResolveImage2(VkCommandBuffer commandBuffer, + const VkResolveImageInfo2 *pResolveImageInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + struct hk_meta_save save; + hk_meta_begin(cmd, &save, VK_PIPELINE_BIND_POINT_GRAPHICS); + vk_meta_resolve_image2(&cmd->vk, &dev->meta, pResolveImageInfo); + hk_meta_end(cmd, &save, VK_PIPELINE_BIND_POINT_GRAPHICS); +} + +void +hk_meta_resolve_rendering(struct hk_cmd_buffer *cmd, + const VkRenderingInfo *pRenderingInfo) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + struct hk_meta_save save; + hk_meta_begin(cmd, &save, VK_PIPELINE_BIND_POINT_GRAPHICS); + vk_meta_resolve_rendering(&cmd->vk, &dev->meta, pRenderingInfo); + hk_meta_end(cmd, &save, VK_PIPELINE_BIND_POINT_GRAPHICS); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdCopyBuffer2(VkCommandBuffer commandBuffer, + const VkCopyBufferInfo2 *pCopyBufferInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + struct hk_meta_save save; + hk_meta_begin(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); + hk_meta_copy_buffer2(&cmd->vk, &dev->meta, pCopyBufferInfo); + hk_meta_end(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdCopyBufferToImage2(VkCommandBuffer commandBuffer, + const VkCopyBufferToImageInfo2 *pCopyBufferToImageInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + struct hk_meta_save save; + hk_meta_begin(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); + hk_meta_copy_buffer_to_image2(&cmd->vk, &dev->meta, pCopyBufferToImageInfo); + hk_meta_end(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdCopyImageToBuffer2(VkCommandBuffer commandBuffer, + const VkCopyImageToBufferInfo2 *pCopyImageToBufferInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_device *dev = 
hk_cmd_buffer_device(cmd); + + struct hk_meta_save save; + hk_meta_begin(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); + hk_meta_copy_image_to_buffer2(&cmd->vk, &dev->meta, pCopyImageToBufferInfo); + hk_meta_end(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdCopyImage2(VkCommandBuffer commandBuffer, + const VkCopyImageInfo2 *pCopyImageInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + struct hk_meta_save save; + hk_meta_begin(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); + hk_meta_copy_image2(&cmd->vk, &dev->meta, pCopyImageInfo); + hk_meta_end(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdFillBuffer(VkCommandBuffer commandBuffer, VkBuffer dstBuffer, + VkDeviceSize dstOffset, VkDeviceSize dstRange, uint32_t data) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(vk_buffer, buffer, dstBuffer); + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + struct hk_meta_save save; + hk_meta_begin(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); + hk_meta_fill_buffer(&cmd->vk, &dev->meta, buffer, dstOffset, dstRange, data); + hk_meta_end(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdUpdateBuffer(VkCommandBuffer commandBuffer, VkBuffer dstBuffer, + VkDeviceSize dstOffset, VkDeviceSize dstRange, + const void *pData) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(vk_buffer, buffer, dstBuffer); + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + struct hk_meta_save save; + hk_meta_begin(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); + hk_meta_update_buffer(&cmd->vk, &dev->meta, buffer, dstOffset, dstRange, + pData); + hk_meta_end(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdClearAttachments(VkCommandBuffer commandBuffer, uint32_t attachmentCount, + const VkClearAttachment *pAttachments, + uint32_t rectCount, const VkClearRect *pRects) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + struct vk_meta_rendering_info render_info; + hk_meta_init_render(cmd, &render_info); + + struct hk_meta_save save; + hk_meta_begin(cmd, &save, VK_PIPELINE_BIND_POINT_GRAPHICS); + vk_meta_clear_attachments(&cmd->vk, &dev->meta, &render_info, + attachmentCount, pAttachments, rectCount, pRects); + hk_meta_end(cmd, &save, VK_PIPELINE_BIND_POINT_GRAPHICS); +} diff --git a/src/asahi/vulkan/hk_cmd_pool.c b/src/asahi/vulkan/hk_cmd_pool.c new file mode 100644 index 00000000000..a3f2a85468a --- /dev/null +++ b/src/asahi/vulkan/hk_cmd_pool.c @@ -0,0 +1,146 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_cmd_pool.h" +#include "asahi/lib/agx_bo.h" + +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_physical_device.h" + +static VkResult +hk_cmd_bo_create(struct hk_cmd_pool *pool, bool usc, struct hk_cmd_bo **bo_out) +{ + struct hk_device *dev = hk_cmd_pool_device(pool); + struct hk_cmd_bo *bo; + + bo = vk_zalloc(&pool->vk.alloc, sizeof(*bo), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (bo == NULL) + return vk_error(pool, VK_ERROR_OUT_OF_HOST_MEMORY); + + bo->bo = agx_bo_create(&dev->dev, HK_CMD_BO_SIZE, usc ? 
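+                          /* assumption: shader-visible (USC) allocations are
+                           * placed in the low VA range because shader
+                           * pointers are narrower than 64 bits */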
AGX_BO_LOW_VA : 0, + "Command pool"); + if (bo->bo == NULL) { + vk_free(&pool->vk.alloc, bo); + return vk_error(pool, VK_ERROR_OUT_OF_DEVICE_MEMORY); + } + + bo->map = bo->bo->ptr.cpu; + + *bo_out = bo; + return VK_SUCCESS; +} + +static void +hk_cmd_bo_destroy(struct hk_cmd_pool *pool, struct hk_cmd_bo *bo) +{ + agx_bo_unreference(bo->bo); + vk_free(&pool->vk.alloc, bo); +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateCommandPool(VkDevice _device, + const VkCommandPoolCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkCommandPool *pCmdPool) +{ + VK_FROM_HANDLE(hk_device, device, _device); + struct hk_cmd_pool *pool; + + pool = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*pool), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (pool == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + VkResult result = + vk_command_pool_init(&device->vk, &pool->vk, pCreateInfo, pAllocator); + if (result != VK_SUCCESS) { + vk_free2(&device->vk.alloc, pAllocator, pool); + return result; + } + + list_inithead(&pool->free_bos); + list_inithead(&pool->free_usc_bos); + + *pCmdPool = hk_cmd_pool_to_handle(pool); + + return VK_SUCCESS; +} + +static void +hk_cmd_pool_destroy_bos(struct hk_cmd_pool *pool) +{ + list_for_each_entry_safe(struct hk_cmd_bo, bo, &pool->free_bos, link) + hk_cmd_bo_destroy(pool, bo); + + list_inithead(&pool->free_bos); + + list_for_each_entry_safe(struct hk_cmd_bo, bo, &pool->free_usc_bos, link) + hk_cmd_bo_destroy(pool, bo); + + list_inithead(&pool->free_usc_bos); +} + +VkResult +hk_cmd_pool_alloc_bo(struct hk_cmd_pool *pool, bool usc, + struct hk_cmd_bo **bo_out) +{ + struct hk_cmd_bo *bo = NULL; + if (usc) { + if (!list_is_empty(&pool->free_usc_bos)) + bo = list_first_entry(&pool->free_usc_bos, struct hk_cmd_bo, link); + } else { + if (!list_is_empty(&pool->free_bos)) + bo = list_first_entry(&pool->free_bos, struct hk_cmd_bo, link); + } + if (bo) { + list_del(&bo->link); + *bo_out = bo; + return VK_SUCCESS; + } + + return hk_cmd_bo_create(pool, usc, bo_out); +} + +void +hk_cmd_pool_free_bo_list(struct hk_cmd_pool *pool, struct list_head *bos) +{ + list_splicetail(bos, &pool->free_bos); + list_inithead(bos); +} + +void +hk_cmd_pool_free_usc_bo_list(struct hk_cmd_pool *pool, struct list_head *bos) +{ + list_splicetail(bos, &pool->free_usc_bos); + list_inithead(bos); +} + +VKAPI_ATTR void VKAPI_CALL +hk_DestroyCommandPool(VkDevice _device, VkCommandPool commandPool, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, device, _device); + VK_FROM_HANDLE(hk_cmd_pool, pool, commandPool); + + if (!pool) + return; + + vk_command_pool_finish(&pool->vk); + hk_cmd_pool_destroy_bos(pool); + vk_free2(&device->vk.alloc, pAllocator, pool); +} + +VKAPI_ATTR void VKAPI_CALL +hk_TrimCommandPool(VkDevice device, VkCommandPool commandPool, + VkCommandPoolTrimFlags flags) +{ + VK_FROM_HANDLE(hk_cmd_pool, pool, commandPool); + + vk_command_pool_trim(&pool->vk, flags); + hk_cmd_pool_destroy_bos(pool); +} diff --git a/src/asahi/vulkan/hk_cmd_pool.h b/src/asahi/vulkan/hk_cmd_pool.h new file mode 100644 index 00000000000..dbac305f833 --- /dev/null +++ b/src/asahi/vulkan/hk_cmd_pool.h @@ -0,0 +1,49 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "hk_private.h" + +#include "vk_command_pool.h" + +/* XXX: FIXME */ +#define HK_CMD_BO_SIZE 1024 * 1024 + +/* Recyclable command buffer BO, used for both push buffers and upload */ +struct hk_cmd_bo { + struct agx_bo *bo; + + void *map; + + /** Link in hk_cmd_pool::free_bos or hk_cmd_buffer::bos */ + struct list_head link; +}; + +struct hk_cmd_pool { + struct vk_command_pool vk; + + /** List of hk_cmd_bo */ + struct list_head free_bos; + struct list_head free_usc_bos; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_cmd_pool, vk.base, VkCommandPool, + VK_OBJECT_TYPE_COMMAND_POOL) + +static inline struct hk_device * +hk_cmd_pool_device(struct hk_cmd_pool *pool) +{ + return (struct hk_device *)pool->vk.base.device; +} + +VkResult hk_cmd_pool_alloc_bo(struct hk_cmd_pool *pool, bool force_usc, + struct hk_cmd_bo **bo_out); + +void hk_cmd_pool_free_bo_list(struct hk_cmd_pool *pool, struct list_head *bos); +void hk_cmd_pool_free_usc_bo_list(struct hk_cmd_pool *pool, + struct list_head *bos); diff --git a/src/asahi/vulkan/hk_descriptor_set.c b/src/asahi/vulkan/hk_descriptor_set.c new file mode 100644 index 00000000000..b59a9ac4b57 --- /dev/null +++ b/src/asahi/vulkan/hk_descriptor_set.c @@ -0,0 +1,794 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_descriptor_set.h" +#include "asahi/lib/agx_bo.h" +#include "vulkan/vulkan_core.h" + +#include "hk_buffer.h" +#include "hk_buffer_view.h" +#include "hk_descriptor_set_layout.h" +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_image_view.h" +#include "hk_physical_device.h" +#include "hk_sampler.h" + +static inline uint32_t +align_u32(uint32_t v, uint32_t a) +{ + assert(a != 0 && a == (a & -a)); + return (v + a - 1) & ~(a - 1); +} + +static inline void * +desc_ubo_data(struct hk_descriptor_set *set, uint32_t binding, uint32_t elem, + uint32_t *size_out) +{ + const struct hk_descriptor_set_binding_layout *binding_layout = + &set->layout->binding[binding]; + + uint32_t offset = binding_layout->offset + elem * binding_layout->stride; + assert(offset < set->size); + + if (size_out != NULL) + *size_out = set->size - offset; + + return (char *)set->mapped_ptr + offset; +} + +static void +write_desc(struct hk_descriptor_set *set, uint32_t binding, uint32_t elem, + const void *desc_data, size_t desc_size) +{ + ASSERTED uint32_t dst_size; + void *dst = desc_ubo_data(set, binding, elem, &dst_size); + assert(desc_size <= dst_size); + memcpy(dst, desc_data, desc_size); +} + +static void +write_sampled_image_view_desc(struct hk_descriptor_set *set, + const VkDescriptorImageInfo *const info, + uint32_t binding, uint32_t elem, + VkDescriptorType descriptor_type) +{ + struct hk_sampled_image_descriptor desc[3] = {}; + assert(HK_NULL_TEX_OFFSET == 0 && "zero initialized so null descs implicit"); + + uint8_t plane_count = 1; + bool ia = (descriptor_type == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT); + + if (descriptor_type != VK_DESCRIPTOR_TYPE_SAMPLER && info && + info->imageView != VK_NULL_HANDLE) { + VK_FROM_HANDLE(hk_image_view, view, info->imageView); + + plane_count = view->plane_count; + for (uint8_t plane = 0; plane < plane_count; plane++) { + unsigned index = ia ? 
view->planes[plane].ia_desc_index + : view->planes[plane].sampled_desc_index; + + assert(index < (1 << 20)); + desc[plane].image_offset = index * HK_IMAGE_STRIDE; + } + } + + if (descriptor_type == VK_DESCRIPTOR_TYPE_SAMPLER || + descriptor_type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) { + const struct hk_descriptor_set_binding_layout *binding_layout = + &set->layout->binding[binding]; + + struct hk_sampler *sampler; + if (binding_layout->immutable_samplers) { + sampler = binding_layout->immutable_samplers[elem]; + } else { + sampler = hk_sampler_from_handle(info->sampler); + } + + if (sampler->has_border) + assert(plane_count == 1); + else + plane_count = MAX2(plane_count, sampler->plane_count); + + for (uint8_t plane = 0; plane < plane_count; plane++) { + /* We need to replicate the last sampler plane out to all image + * planes due to sampler table entry limitations. See + * hk_CreateSampler in hk_sampler.c for more details. + */ + uint8_t sampler_plane = MIN2(plane, sampler->plane_count - 1); + assert(sampler->planes[sampler_plane].hw->index < (1 << 12)); + + /* All bindless samplers are indexed from 28 in hardware, add here so + * we don't have to care in the shader. + */ + desc[plane].sampler_index = + sampler->planes[sampler_plane].hw->index + 28; + desc[plane].lod_bias_fp16 = sampler->lod_bias_fp16; + desc[plane].has_border = sampler->has_border; + } + + if (sampler->has_border) { + assert(sampler->plane_count == 2); + desc[0].clamp_0_sampler_index = sampler->planes[1].hw->index + 28; + + static_assert(sizeof(desc[0].border) == sizeof(sampler->custom_border), + "fixed format"); + + memcpy(desc[0].border, sampler->custom_border.uint32, + sizeof(sampler->custom_border)); + } + } + write_desc(set, binding, elem, desc, sizeof(desc[0]) * plane_count); +} + +static void +write_storage_image_view_desc(struct hk_descriptor_set *set, + const VkDescriptorImageInfo *const info, + uint32_t binding, uint32_t elem) +{ + struct hk_storage_image_descriptor desc = {}; + + if (info && info->imageView != VK_NULL_HANDLE) { + VK_FROM_HANDLE(hk_image_view, view, info->imageView); + + /* Storage images are always single plane */ + assert(view->plane_count == 1); + uint8_t plane = 0; + + desc.tex_offset = + view->planes[plane].ro_storage_desc_index * HK_IMAGE_STRIDE; + + desc.pbe_offset = + view->planes[plane].storage_desc_index * HK_IMAGE_STRIDE; + } else { + desc.tex_offset = HK_NULL_TEX_OFFSET; + desc.pbe_offset = HK_NULL_PBE_OFFSET; + } + + write_desc(set, binding, elem, &desc, sizeof(desc)); +} + +static void +write_buffer_desc(struct hk_descriptor_set *set, + const VkDescriptorBufferInfo *const info, uint32_t binding, + uint32_t elem) +{ + VK_FROM_HANDLE(hk_buffer, buffer, info->buffer); + + const struct hk_addr_range addr_range = + hk_buffer_addr_range(buffer, info->offset, info->range); + assert(addr_range.range <= UINT32_MAX); + + const struct hk_buffer_address desc = { + .base_addr = addr_range.addr, + .size = addr_range.range, + }; + write_desc(set, binding, elem, &desc, sizeof(desc)); +} + +static void +write_dynamic_buffer_desc(struct hk_descriptor_set *set, + const VkDescriptorBufferInfo *const info, + uint32_t binding, uint32_t elem) +{ + VK_FROM_HANDLE(hk_buffer, buffer, info->buffer); + const struct hk_descriptor_set_binding_layout *binding_layout = + &set->layout->binding[binding]; + + const struct hk_addr_range addr_range = + hk_buffer_addr_range(buffer, info->offset, info->range); + assert(addr_range.range <= UINT32_MAX); + + struct hk_buffer_address *desc = + 
&set->dynamic_buffers[binding_layout->dynamic_buffer_index + elem]; + *desc = (struct hk_buffer_address){ + .base_addr = addr_range.addr, + .size = addr_range.range, + }; +} + +static void +write_buffer_view_desc(struct hk_descriptor_set *set, + const VkBufferView bufferView, uint32_t binding, + uint32_t elem) +{ + struct hk_buffer_view_descriptor desc = {}; + if (bufferView != VK_NULL_HANDLE) { + VK_FROM_HANDLE(hk_buffer_view, view, bufferView); + + assert(view->tex_desc_index < (1 << 20)); + assert(view->pbe_desc_index < (1 << 20)); + + desc.tex_offset = view->tex_desc_index * HK_IMAGE_STRIDE; + desc.pbe_offset = view->pbe_desc_index * HK_IMAGE_STRIDE; + } else { + desc.tex_offset = HK_NULL_TEX_OFFSET; + desc.pbe_offset = HK_NULL_PBE_OFFSET; + } + + write_desc(set, binding, elem, &desc, sizeof(desc)); +} + +static void +write_inline_uniform_data(struct hk_descriptor_set *set, + const VkWriteDescriptorSetInlineUniformBlock *info, + uint32_t binding, uint32_t offset) +{ + assert(set->layout->binding[binding].stride == 1); + write_desc(set, binding, offset, info->pData, info->dataSize); +} + +VKAPI_ATTR void VKAPI_CALL +hk_UpdateDescriptorSets(VkDevice device, uint32_t descriptorWriteCount, + const VkWriteDescriptorSet *pDescriptorWrites, + uint32_t descriptorCopyCount, + const VkCopyDescriptorSet *pDescriptorCopies) +{ + for (uint32_t w = 0; w < descriptorWriteCount; w++) { + const VkWriteDescriptorSet *write = &pDescriptorWrites[w]; + VK_FROM_HANDLE(hk_descriptor_set, set, write->dstSet); + + switch (write->descriptorType) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + write_sampled_image_view_desc( + set, write->pImageInfo + j, write->dstBinding, + write->dstArrayElement + j, write->descriptorType); + } + break; + + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + write_storage_image_view_desc(set, write->pImageInfo + j, + write->dstBinding, + write->dstArrayElement + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + write_buffer_view_desc(set, write->pTexelBufferView[j], + write->dstBinding, + write->dstArrayElement + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + write_buffer_desc(set, write->pBufferInfo + j, write->dstBinding, + write->dstArrayElement + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + write_dynamic_buffer_desc(set, write->pBufferInfo + j, + write->dstBinding, + write->dstArrayElement + j); + } + break; + + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: { + const VkWriteDescriptorSetInlineUniformBlock *write_inline = + vk_find_struct_const(write->pNext, + WRITE_DESCRIPTOR_SET_INLINE_UNIFORM_BLOCK); + assert(write_inline->dataSize == write->descriptorCount); + write_inline_uniform_data(set, write_inline, write->dstBinding, + write->dstArrayElement); + break; + } + + default: + break; + } + } + + for (uint32_t i = 0; i < descriptorCopyCount; i++) { + const VkCopyDescriptorSet *copy = &pDescriptorCopies[i]; + VK_FROM_HANDLE(hk_descriptor_set, src, copy->srcSet); + 
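+      /* Copies are raw memcpys of the descriptor-buffer data; dynamic buffer
+       * bindings have no buffer-backed storage and are copied separately in
+       * the switch below.
+       */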
VK_FROM_HANDLE(hk_descriptor_set, dst, copy->dstSet); + + const struct hk_descriptor_set_binding_layout *src_binding_layout = + &src->layout->binding[copy->srcBinding]; + const struct hk_descriptor_set_binding_layout *dst_binding_layout = + &dst->layout->binding[copy->dstBinding]; + + if (dst_binding_layout->stride > 0 && src_binding_layout->stride > 0) { + for (uint32_t j = 0; j < copy->descriptorCount; j++) { + ASSERTED uint32_t dst_max_size, src_max_size; + void *dst_map = desc_ubo_data( + dst, copy->dstBinding, copy->dstArrayElement + j, &dst_max_size); + const void *src_map = desc_ubo_data( + src, copy->srcBinding, copy->srcArrayElement + j, &src_max_size); + const uint32_t copy_size = + MIN2(dst_binding_layout->stride, src_binding_layout->stride); + assert(copy_size <= dst_max_size && copy_size <= src_max_size); + memcpy(dst_map, src_map, copy_size); + } + } + + switch (src_binding_layout->type) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { + const uint32_t dst_dyn_start = + dst_binding_layout->dynamic_buffer_index + copy->dstArrayElement; + const uint32_t src_dyn_start = + src_binding_layout->dynamic_buffer_index + copy->srcArrayElement; + typed_memcpy(&dst->dynamic_buffers[dst_dyn_start], + &src->dynamic_buffers[src_dyn_start], + copy->descriptorCount); + break; + } + default: + break; + } + } +} + +void +hk_push_descriptor_set_update(struct hk_push_descriptor_set *push_set, + struct hk_descriptor_set_layout *layout, + uint32_t write_count, + const VkWriteDescriptorSet *writes) +{ + assert(layout->non_variable_descriptor_buffer_size < sizeof(push_set->data)); + struct hk_descriptor_set set = { + .layout = layout, + .size = sizeof(push_set->data), + .mapped_ptr = push_set->data, + }; + + for (uint32_t w = 0; w < write_count; w++) { + const VkWriteDescriptorSet *write = &writes[w]; + assert(write->dstSet == VK_NULL_HANDLE); + + switch (write->descriptorType) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + write_sampled_image_view_desc( + &set, write->pImageInfo + j, write->dstBinding, + write->dstArrayElement + j, write->descriptorType); + } + break; + + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + write_storage_image_view_desc(&set, write->pImageInfo + j, + write->dstBinding, + write->dstArrayElement + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + write_buffer_view_desc(&set, write->pTexelBufferView[j], + write->dstBinding, + write->dstArrayElement + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + write_buffer_desc(&set, write->pBufferInfo + j, write->dstBinding, + write->dstArrayElement + j); + } + break; + + default: + break; + } + } +} + +static void hk_descriptor_pool_free(struct hk_descriptor_pool *pool, + uint64_t addr, uint64_t size); + +static void +hk_descriptor_set_destroy(struct hk_device *dev, + struct hk_descriptor_pool *pool, + struct hk_descriptor_set *set) +{ + list_del(&set->link); + if (set->size > 0) + hk_descriptor_pool_free(pool, set->addr, set->size); + vk_descriptor_set_layout_unref(&dev->vk, &set->layout->vk); + + 
vk_object_free(&dev->vk, NULL, set); +} + +static void +hk_destroy_descriptor_pool(struct hk_device *dev, + const VkAllocationCallbacks *pAllocator, + struct hk_descriptor_pool *pool) +{ + list_for_each_entry_safe(struct hk_descriptor_set, set, &pool->sets, link) + hk_descriptor_set_destroy(dev, pool, set); + + util_vma_heap_finish(&pool->heap); + agx_bo_unreference(pool->bo); + + vk_object_free(&dev->vk, pAllocator, pool); +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateDescriptorPool(VkDevice _device, + const VkDescriptorPoolCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkDescriptorPool *pDescriptorPool) +{ + VK_FROM_HANDLE(hk_device, dev, _device); + struct hk_physical_device *pdev = hk_device_physical(dev); + struct hk_descriptor_pool *pool; + + pool = vk_object_zalloc(&dev->vk, pAllocator, sizeof(*pool), + VK_OBJECT_TYPE_DESCRIPTOR_POOL); + if (!pool) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + list_inithead(&pool->sets); + + const VkMutableDescriptorTypeCreateInfoEXT *mutable_info = + vk_find_struct_const(pCreateInfo->pNext, + MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_EXT); + + uint32_t max_align = 0; + for (unsigned i = 0; i < pCreateInfo->poolSizeCount; ++i) { + const VkMutableDescriptorTypeListEXT *type_list = NULL; + if (pCreateInfo->pPoolSizes[i].type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT && + mutable_info && i < mutable_info->mutableDescriptorTypeListCount) + type_list = &mutable_info->pMutableDescriptorTypeLists[i]; + + uint32_t stride, alignment; + hk_descriptor_stride_align_for_type(pdev, pCreateInfo->pPoolSizes[i].type, + type_list, &stride, &alignment); + max_align = MAX2(max_align, alignment); + } + + uint64_t bo_size = 0; + for (unsigned i = 0; i < pCreateInfo->poolSizeCount; ++i) { + const VkMutableDescriptorTypeListEXT *type_list = NULL; + if (pCreateInfo->pPoolSizes[i].type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT && + mutable_info && i < mutable_info->mutableDescriptorTypeListCount) + type_list = &mutable_info->pMutableDescriptorTypeLists[i]; + + uint32_t stride, alignment; + hk_descriptor_stride_align_for_type(pdev, pCreateInfo->pPoolSizes[i].type, + type_list, &stride, &alignment); + bo_size += + MAX2(stride, max_align) * pCreateInfo->pPoolSizes[i].descriptorCount; + } + + /* Individual descriptor sets are aligned to the min UBO alignment to + * ensure that we don't end up with unaligned data access in any shaders. + * This means that each descriptor buffer allocated may burn up to 16B of + * extra space to get the right alignment. (Technically, it's at most 28B + * because we're always going to start at least 4B aligned but we're being + * conservative here.) Allocate enough extra space that we can chop it + * into maxSets pieces and align each one of them to 32B. + */ + bo_size += HK_MIN_UBO_ALIGNMENT * pCreateInfo->maxSets; + + if (bo_size) { + pool->bo = agx_bo_create(&dev->dev, bo_size, 0, "Descriptor pool"); + if (!pool->bo) { + hk_destroy_descriptor_pool(dev, pAllocator, pool); + return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY); + } + + pool->mapped_ptr = pool->bo->ptr.cpu; + + /* The BO may be larger thanks to GPU page alignment. We may as well + * make that extra space available to the client. 
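+       * That is why util_vma_heap_init() below spans bo->size rather than
+       * the requested bo_size.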
+ */ + assert(pool->bo->size >= bo_size); + util_vma_heap_init(&pool->heap, pool->bo->ptr.gpu, pool->bo->size); + } else { + util_vma_heap_init(&pool->heap, 0, 0); + } + + *pDescriptorPool = hk_descriptor_pool_to_handle(pool); + return VK_SUCCESS; +} + +static VkResult +hk_descriptor_pool_alloc(struct hk_descriptor_pool *pool, uint64_t size, + uint64_t alignment, uint64_t *addr_out, void **map_out) +{ + assert(size > 0); + uint64_t addr = util_vma_heap_alloc(&pool->heap, size, alignment); + if (addr == 0) + return VK_ERROR_OUT_OF_POOL_MEMORY; + + assert(addr >= pool->bo->ptr.gpu); + assert(addr + size <= pool->bo->ptr.gpu + pool->bo->size); + uint64_t offset = addr - pool->bo->ptr.gpu; + + *addr_out = addr; + *map_out = pool->mapped_ptr + offset; + + return VK_SUCCESS; +} + +static void +hk_descriptor_pool_free(struct hk_descriptor_pool *pool, uint64_t addr, + uint64_t size) +{ + assert(size > 0); + assert(addr >= pool->bo->ptr.gpu); + assert(addr + size <= pool->bo->ptr.gpu + pool->bo->size); + util_vma_heap_free(&pool->heap, addr, size); +} + +static VkResult +hk_descriptor_set_create(struct hk_device *dev, struct hk_descriptor_pool *pool, + struct hk_descriptor_set_layout *layout, + uint32_t variable_count, + struct hk_descriptor_set **out_set) +{ + struct hk_descriptor_set *set; + VkResult result; + + uint32_t mem_size = + sizeof(struct hk_descriptor_set) + + layout->dynamic_buffer_count * sizeof(struct hk_buffer_address); + + set = + vk_object_zalloc(&dev->vk, NULL, mem_size, VK_OBJECT_TYPE_DESCRIPTOR_SET); + if (!set) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + set->size = layout->non_variable_descriptor_buffer_size; + + if (layout->binding_count > 0 && + (layout->binding[layout->binding_count - 1].flags & + VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT)) { + uint32_t stride = layout->binding[layout->binding_count - 1].stride; + set->size += stride * variable_count; + } + + set->size = align64(set->size, HK_MIN_UBO_ALIGNMENT); + + if (set->size > 0) { + result = hk_descriptor_pool_alloc(pool, set->size, HK_MIN_UBO_ALIGNMENT, + &set->addr, &set->mapped_ptr); + if (result != VK_SUCCESS) { + vk_object_free(&dev->vk, NULL, set); + return result; + } + } + + vk_descriptor_set_layout_ref(&layout->vk); + set->layout = layout; + + for (uint32_t b = 0; b < layout->binding_count; b++) { + if (layout->binding[b].type != VK_DESCRIPTOR_TYPE_SAMPLER && + layout->binding[b].type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) + continue; + + if (layout->binding[b].immutable_samplers == NULL) + continue; + + uint32_t array_size = layout->binding[b].array_size; + if (layout->binding[b].flags & + VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT) + array_size = variable_count; + + for (uint32_t j = 0; j < array_size; j++) { + write_sampled_image_view_desc(set, NULL, b, j, + layout->binding[b].type); + } + } + + list_addtail(&set->link, &pool->sets); + *out_set = set; + + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_AllocateDescriptorSets(VkDevice device, + const VkDescriptorSetAllocateInfo *pAllocateInfo, + VkDescriptorSet *pDescriptorSets) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_descriptor_pool, pool, pAllocateInfo->descriptorPool); + + VkResult result = VK_SUCCESS; + uint32_t i; + + struct hk_descriptor_set *set = NULL; + + const VkDescriptorSetVariableDescriptorCountAllocateInfo *var_desc_count = + vk_find_struct_const( + pAllocateInfo->pNext, + DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_ALLOCATE_INFO); + + /* allocate a set of buffers for 
each shader to contain descriptors */ + for (i = 0; i < pAllocateInfo->descriptorSetCount; i++) { + VK_FROM_HANDLE(hk_descriptor_set_layout, layout, + pAllocateInfo->pSetLayouts[i]); + /* If descriptorSetCount is zero or this structure is not included in + * the pNext chain, then the variable lengths are considered to be zero. + */ + const uint32_t variable_count = + var_desc_count && var_desc_count->descriptorSetCount > 0 + ? var_desc_count->pDescriptorCounts[i] + : 0; + + result = + hk_descriptor_set_create(dev, pool, layout, variable_count, &set); + if (result != VK_SUCCESS) + break; + + pDescriptorSets[i] = hk_descriptor_set_to_handle(set); + } + + if (result != VK_SUCCESS) { + hk_FreeDescriptorSets(device, pAllocateInfo->descriptorPool, i, + pDescriptorSets); + for (i = 0; i < pAllocateInfo->descriptorSetCount; i++) { + pDescriptorSets[i] = VK_NULL_HANDLE; + } + } + return result; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_FreeDescriptorSets(VkDevice device, VkDescriptorPool descriptorPool, + uint32_t descriptorSetCount, + const VkDescriptorSet *pDescriptorSets) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_descriptor_pool, pool, descriptorPool); + + for (uint32_t i = 0; i < descriptorSetCount; i++) { + VK_FROM_HANDLE(hk_descriptor_set, set, pDescriptorSets[i]); + + if (set) + hk_descriptor_set_destroy(dev, pool, set); + } + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +hk_DestroyDescriptorPool(VkDevice device, VkDescriptorPool _pool, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_descriptor_pool, pool, _pool); + + if (!_pool) + return; + + hk_destroy_descriptor_pool(dev, pAllocator, pool); +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_ResetDescriptorPool(VkDevice device, VkDescriptorPool descriptorPool, + VkDescriptorPoolResetFlags flags) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_descriptor_pool, pool, descriptorPool); + + list_for_each_entry_safe(struct hk_descriptor_set, set, &pool->sets, link) + hk_descriptor_set_destroy(dev, pool, set); + + return VK_SUCCESS; +} + +static void +hk_descriptor_set_write_template( + struct hk_descriptor_set *set, + const struct vk_descriptor_update_template *template, const void *data) +{ + for (uint32_t i = 0; i < template->entry_count; i++) { + const struct vk_descriptor_template_entry *entry = &template->entries[i]; + + switch (entry->type) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + for (uint32_t j = 0; j < entry->array_count; j++) { + const VkDescriptorImageInfo *info = + data + entry->offset + j * entry->stride; + + write_sampled_image_view_desc(set, info, entry->binding, + entry->array_element + j, + entry->type); + } + break; + + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + for (uint32_t j = 0; j < entry->array_count; j++) { + const VkDescriptorImageInfo *info = + data + entry->offset + j * entry->stride; + + write_storage_image_view_desc(set, info, entry->binding, + entry->array_element + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + for (uint32_t j = 0; j < entry->array_count; j++) { + const VkBufferView *bview = + data + entry->offset + j * entry->stride; + + write_buffer_view_desc(set, *bview, entry->binding, + entry->array_element + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + for 
(uint32_t j = 0; j < entry->array_count; j++) { + const VkDescriptorBufferInfo *info = + data + entry->offset + j * entry->stride; + + write_buffer_desc(set, info, entry->binding, + entry->array_element + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + for (uint32_t j = 0; j < entry->array_count; j++) { + const VkDescriptorBufferInfo *info = + data + entry->offset + j * entry->stride; + + write_dynamic_buffer_desc(set, info, entry->binding, + entry->array_element + j); + } + break; + + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: + write_desc(set, entry->binding, entry->array_element, + data + entry->offset, entry->array_count); + break; + + default: + break; + } + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_UpdateDescriptorSetWithTemplate( + VkDevice device, VkDescriptorSet descriptorSet, + VkDescriptorUpdateTemplate descriptorUpdateTemplate, const void *pData) +{ + VK_FROM_HANDLE(hk_descriptor_set, set, descriptorSet); + VK_FROM_HANDLE(vk_descriptor_update_template, template, + descriptorUpdateTemplate); + + hk_descriptor_set_write_template(set, template, pData); +} + +void +hk_push_descriptor_set_update_template( + struct hk_push_descriptor_set *push_set, + struct hk_descriptor_set_layout *layout, + const struct vk_descriptor_update_template *template, const void *data) +{ + struct hk_descriptor_set tmp_set = { + .layout = layout, + .size = sizeof(push_set->data), + .mapped_ptr = push_set->data, + }; + hk_descriptor_set_write_template(&tmp_set, template, data); +} diff --git a/src/asahi/vulkan/hk_descriptor_set.h b/src/asahi/vulkan/hk_descriptor_set.h new file mode 100644 index 00000000000..88606654df2 --- /dev/null +++ b/src/asahi/vulkan/hk_descriptor_set.h @@ -0,0 +1,107 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "hk_private.h" + +#include "hk_device.h" +#include "vk_descriptor_update_template.h" +#include "vk_object.h" + +#include "util/list.h" +#include "util/vma.h" + +/* Stride of the image heap, equal to the size of a texture/PBE descriptor */ +#define HK_IMAGE_STRIDE (24) + +struct hk_descriptor_set_layout; + +struct hk_sampled_image_descriptor { + uint32_t image_offset; + uint16_t sampler_index; + uint16_t lod_bias_fp16; + /* TODO: This should probably be a heap! */ + uint32_t border[4]; + /* XXX: Single bit! Tuck it in somewhere else */ + uint32_t has_border; + uint16_t clamp_0_sampler_index; + uint16_t pad_0; +}; +static_assert(sizeof(struct hk_sampled_image_descriptor) == 32, + "hk_sampled_image_descriptor has no holes"); + +struct hk_storage_image_descriptor { + uint32_t tex_offset; + uint32_t pbe_offset; +}; +static_assert(sizeof(struct hk_storage_image_descriptor) == 8, + "hk_storage_image_descriptor has no holes"); + +struct hk_buffer_view_descriptor { + uint32_t tex_offset; + uint32_t pbe_offset; +}; +static_assert(sizeof(struct hk_buffer_view_descriptor) == 8, + "hk_buffer_view_descriptor has no holes"); + +/* This has to match nir_address_format_64bit_bounded_global */ +struct hk_buffer_address { + uint64_t base_addr; + uint32_t size; + uint32_t zero; /* Must be zero! 
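+                        (it lines up with the offset component of
+                        nir_address_format_64bit_bounded_global, so the
+                        descriptor is written with a zero starting offset)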
*/ +}; + +struct hk_descriptor_pool { + struct vk_object_base base; + + struct list_head sets; + + struct agx_bo *bo; + uint8_t *mapped_ptr; + struct util_vma_heap heap; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_descriptor_pool, base, VkDescriptorPool, + VK_OBJECT_TYPE_DESCRIPTOR_POOL) + +struct hk_descriptor_set { + struct vk_object_base base; + + /* Link in hk_descriptor_pool::sets */ + struct list_head link; + + struct hk_descriptor_set_layout *layout; + void *mapped_ptr; + uint64_t addr; + uint32_t size; + + struct hk_buffer_address dynamic_buffers[]; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_descriptor_set, base, VkDescriptorSet, + VK_OBJECT_TYPE_DESCRIPTOR_SET) + +static inline uint64_t +hk_descriptor_set_addr(const struct hk_descriptor_set *set) +{ + return set->addr; +} + +struct hk_push_descriptor_set { + uint8_t data[HK_PUSH_DESCRIPTOR_SET_SIZE]; +}; + +void hk_push_descriptor_set_update(struct hk_push_descriptor_set *push_set, + struct hk_descriptor_set_layout *layout, + uint32_t write_count, + const VkWriteDescriptorSet *writes); + +void hk_push_descriptor_set_update_template( + struct hk_push_descriptor_set *push_set, + struct hk_descriptor_set_layout *layout, + const struct vk_descriptor_update_template *template, const void *data); diff --git a/src/asahi/vulkan/hk_descriptor_set_layout.c b/src/asahi/vulkan/hk_descriptor_set_layout.c new file mode 100644 index 00000000000..7efe2e127a6 --- /dev/null +++ b/src/asahi/vulkan/hk_descriptor_set_layout.c @@ -0,0 +1,423 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_descriptor_set_layout.h" + +#include "hk_descriptor_set.h" +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_physical_device.h" +#include "hk_sampler.h" + +#include "vk_pipeline_layout.h" + +static bool +binding_has_immutable_samplers(const VkDescriptorSetLayoutBinding *binding) +{ + switch (binding->descriptorType) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + return binding->pImmutableSamplers != NULL; + + default: + return false; + } +} + +void +hk_descriptor_stride_align_for_type( + const struct hk_physical_device *pdev, VkDescriptorType type, + const VkMutableDescriptorTypeListEXT *type_list, uint32_t *stride, + uint32_t *alignment) +{ + switch (type) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + /* TODO: How do samplers work? 
*/ + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + *stride = *alignment = sizeof(struct hk_sampled_image_descriptor); + break; + + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + *stride = *alignment = sizeof(struct hk_storage_image_descriptor); + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + *stride = *alignment = sizeof(struct hk_buffer_view_descriptor); + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + *stride = *alignment = sizeof(struct hk_buffer_address); + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + *stride = *alignment = 0; /* These don't take up buffer space */ + break; + + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: + *stride = 1; /* Array size is bytes */ + *alignment = HK_MIN_UBO_ALIGNMENT; + break; + + case VK_DESCRIPTOR_TYPE_MUTABLE_EXT: + *stride = *alignment = 0; + if (type_list == NULL) + *stride = *alignment = HK_MAX_DESCRIPTOR_SIZE; + for (unsigned i = 0; type_list && i < type_list->descriptorTypeCount; + i++) { + /* This shouldn't recurse */ + assert(type_list->pDescriptorTypes[i] != + VK_DESCRIPTOR_TYPE_MUTABLE_EXT); + uint32_t desc_stride, desc_align; + hk_descriptor_stride_align_for_type(pdev, + type_list->pDescriptorTypes[i], + NULL, &desc_stride, &desc_align); + *stride = MAX2(*stride, desc_stride); + *alignment = MAX2(*alignment, desc_align); + } + *stride = ALIGN(*stride, *alignment); + break; + + default: + unreachable("Invalid descriptor type"); + } + + assert(*stride <= HK_MAX_DESCRIPTOR_SIZE); +} + +static const VkMutableDescriptorTypeListEXT * +hk_descriptor_get_type_list(VkDescriptorType type, + const VkMutableDescriptorTypeCreateInfoEXT *info, + const uint32_t info_idx) +{ + const VkMutableDescriptorTypeListEXT *type_list = NULL; + if (type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT) { + assert(info != NULL); + assert(info_idx < info->mutableDescriptorTypeListCount); + type_list = &info->pMutableDescriptorTypeLists[info_idx]; + } + return type_list; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateDescriptorSetLayout(VkDevice device, + const VkDescriptorSetLayoutCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkDescriptorSetLayout *pSetLayout) +{ + VK_FROM_HANDLE(hk_device, dev, device); + struct hk_physical_device *pdev = hk_device_physical(dev); + + uint32_t num_bindings = 0; + uint32_t immutable_sampler_count = 0; + for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) { + const VkDescriptorSetLayoutBinding *binding = &pCreateInfo->pBindings[j]; + num_bindings = MAX2(num_bindings, binding->binding + 1); + + /* From the Vulkan 1.1.97 spec for VkDescriptorSetLayoutBinding: + * + * "If descriptorType specifies a VK_DESCRIPTOR_TYPE_SAMPLER or + * VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER type descriptor, then + * pImmutableSamplers can be used to initialize a set of immutable + * samplers. [...] If descriptorType is not one of these descriptor + * types, then pImmutableSamplers is ignored. + * + * We need to be careful here and only parse pImmutableSamplers if we + * have one of the right descriptor types. 
+ */ + if (binding_has_immutable_samplers(binding)) + immutable_sampler_count += binding->descriptorCount; + } + + VK_MULTIALLOC(ma); + VK_MULTIALLOC_DECL(&ma, struct hk_descriptor_set_layout, layout, 1); + VK_MULTIALLOC_DECL(&ma, struct hk_descriptor_set_binding_layout, bindings, + num_bindings); + VK_MULTIALLOC_DECL(&ma, struct hk_sampler *, samplers, + immutable_sampler_count); + + if (!vk_descriptor_set_layout_multizalloc(&dev->vk, &ma)) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + layout->binding_count = num_bindings; + + for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) { + const VkDescriptorSetLayoutBinding *binding = &pCreateInfo->pBindings[j]; + uint32_t b = binding->binding; + /* We temporarily store pCreateInfo->pBindings[] index (plus one) in the + * immutable_samplers pointer. This provides us with a quick-and-dirty + * way to sort the bindings by binding number. + */ + layout->binding[b].immutable_samplers = (void *)(uintptr_t)(j + 1); + } + + const VkDescriptorSetLayoutBindingFlagsCreateInfo *binding_flags_info = + vk_find_struct_const(pCreateInfo->pNext, + DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO); + const VkMutableDescriptorTypeCreateInfoEXT *mutable_info = + vk_find_struct_const(pCreateInfo->pNext, + MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_EXT); + + uint32_t buffer_size = 0; + uint8_t dynamic_buffer_count = 0; + for (uint32_t b = 0; b < num_bindings; b++) { + /* We stashed the pCreateInfo->pBindings[] index (plus one) in the + * immutable_samplers pointer. Check for NULL (empty binding) and then + * reset it and compute the index. + */ + if (layout->binding[b].immutable_samplers == NULL) + continue; + const uint32_t info_idx = + (uintptr_t)(void *)layout->binding[b].immutable_samplers - 1; + layout->binding[b].immutable_samplers = NULL; + + const VkDescriptorSetLayoutBinding *binding = + &pCreateInfo->pBindings[info_idx]; + + if (binding->descriptorCount == 0) + continue; + + layout->binding[b].type = binding->descriptorType; + + if (binding_flags_info && binding_flags_info->bindingCount > 0) { + assert(binding_flags_info->bindingCount == pCreateInfo->bindingCount); + layout->binding[b].flags = binding_flags_info->pBindingFlags[info_idx]; + } + + layout->binding[b].array_size = binding->descriptorCount; + + switch (binding->descriptorType) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + layout->binding[b].dynamic_buffer_index = dynamic_buffer_count; + dynamic_buffer_count += binding->descriptorCount; + break; + default: + break; + } + + const VkMutableDescriptorTypeListEXT *type_list = + hk_descriptor_get_type_list(binding->descriptorType, mutable_info, + info_idx); + + uint32_t stride, alignment; + hk_descriptor_stride_align_for_type(pdev, binding->descriptorType, + type_list, &stride, &alignment); + + uint8_t max_plane_count = 1; + + if (binding_has_immutable_samplers(binding)) { + layout->binding[b].immutable_samplers = samplers; + samplers += binding->descriptorCount; + for (uint32_t i = 0; i < binding->descriptorCount; i++) { + VK_FROM_HANDLE(hk_sampler, sampler, binding->pImmutableSamplers[i]); + layout->binding[b].immutable_samplers[i] = sampler; + const uint8_t sampler_plane_count = + sampler->vk.ycbcr_conversion + ? 
vk_format_get_plane_count( + sampler->vk.ycbcr_conversion->state.format) + : 1; + if (max_plane_count < sampler_plane_count) + max_plane_count = sampler_plane_count; + } + } + + stride *= max_plane_count; + + if (stride > 0) { + assert(stride <= UINT8_MAX); + assert(util_is_power_of_two_nonzero(alignment)); + + buffer_size = align64(buffer_size, alignment); + layout->binding[b].offset = buffer_size; + layout->binding[b].stride = stride; + + if (layout->binding[b].flags & + VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT) { + /* From the Vulkan 1.3.256 spec: + * + * VUID-VkDescriptorSetLayoutBindingFlagsCreateInfo-pBindingFlags-03004 + * "If an element of pBindingFlags includes + * VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT, then + * all other elements of + * VkDescriptorSetLayoutCreateInfo::pBindings must have a + * smaller value of binding" + * + * In other words, it has to be the last binding. + */ + assert(b == num_bindings - 1); + } else { + /* the allocation size will be computed at descriptor allocation, + * but the buffer size will be already aligned as this binding will + * be the last + */ + buffer_size += stride * binding->descriptorCount; + } + } + } + + layout->non_variable_descriptor_buffer_size = buffer_size; + layout->dynamic_buffer_count = dynamic_buffer_count; + + struct mesa_blake3 blake3_ctx; + _mesa_blake3_init(&blake3_ctx); + +#define BLAKE3_UPDATE_VALUE(x) \ + _mesa_blake3_update(&blake3_ctx, &(x), sizeof(x)); + BLAKE3_UPDATE_VALUE(layout->non_variable_descriptor_buffer_size); + BLAKE3_UPDATE_VALUE(layout->dynamic_buffer_count); + BLAKE3_UPDATE_VALUE(layout->binding_count); + + for (uint32_t b = 0; b < num_bindings; b++) { + BLAKE3_UPDATE_VALUE(layout->binding[b].type); + BLAKE3_UPDATE_VALUE(layout->binding[b].flags); + BLAKE3_UPDATE_VALUE(layout->binding[b].array_size); + BLAKE3_UPDATE_VALUE(layout->binding[b].offset); + BLAKE3_UPDATE_VALUE(layout->binding[b].stride); + BLAKE3_UPDATE_VALUE(layout->binding[b].dynamic_buffer_index); + + if (layout->binding[b].immutable_samplers != NULL) { + for (uint32_t i = 0; i < layout->binding[b].array_size; i++) { + const struct hk_sampler *sampler = + layout->binding[b].immutable_samplers[i]; + + /* We zalloc the object, so it's safe to hash the whole thing */ + if (sampler != NULL && sampler->vk.ycbcr_conversion != NULL) + BLAKE3_UPDATE_VALUE(sampler->vk.ycbcr_conversion->state); + } + } + } +#undef BLAKE3_UPDATE_VALUE + + _mesa_blake3_final(&blake3_ctx, layout->vk.blake3); + + *pSetLayout = hk_descriptor_set_layout_to_handle(layout); + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetDescriptorSetLayoutSupport( + VkDevice device, const VkDescriptorSetLayoutCreateInfo *pCreateInfo, + VkDescriptorSetLayoutSupport *pSupport) +{ + VK_FROM_HANDLE(hk_device, dev, device); + struct hk_physical_device *pdev = hk_device_physical(dev); + + const VkMutableDescriptorTypeCreateInfoEXT *mutable_info = + vk_find_struct_const(pCreateInfo->pNext, + MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_EXT); + const VkDescriptorSetLayoutBindingFlagsCreateInfo *binding_flags = + vk_find_struct_const(pCreateInfo->pNext, + DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO); + + /* Figure out the maximum alignment up-front. Otherwise, we need to sort + * the list of descriptors by binding number in order to get the size + * accumulation right. 
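+    * Padding every binding to max_align instead lets the loop below
+    * accumulate sizes in pCreateInfo order and still reach a safe upper
+    * bound for the support check.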
+ */ + uint32_t max_align = 0; + for (uint32_t i = 0; i < pCreateInfo->bindingCount; i++) { + const VkDescriptorSetLayoutBinding *binding = &pCreateInfo->pBindings[i]; + const VkMutableDescriptorTypeListEXT *type_list = + hk_descriptor_get_type_list(binding->descriptorType, mutable_info, i); + + uint32_t stride, alignment; + hk_descriptor_stride_align_for_type(pdev, binding->descriptorType, + type_list, &stride, &alignment); + max_align = MAX2(max_align, alignment); + } + + uint64_t non_variable_size = 0; + uint32_t variable_stride = 0; + uint32_t variable_count = 0; + uint8_t dynamic_buffer_count = 0; + + for (uint32_t i = 0; i < pCreateInfo->bindingCount; i++) { + const VkDescriptorSetLayoutBinding *binding = &pCreateInfo->pBindings[i]; + + VkDescriptorBindingFlags flags = 0; + if (binding_flags != NULL && binding_flags->bindingCount > 0) + flags = binding_flags->pBindingFlags[i]; + + switch (binding->descriptorType) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + dynamic_buffer_count += binding->descriptorCount; + break; + default: + break; + } + + const VkMutableDescriptorTypeListEXT *type_list = + hk_descriptor_get_type_list(binding->descriptorType, mutable_info, i); + + uint32_t stride, alignment; + hk_descriptor_stride_align_for_type(pdev, binding->descriptorType, + type_list, &stride, &alignment); + + if (stride > 0) { + assert(stride <= UINT8_MAX); + assert(util_is_power_of_two_nonzero(alignment)); + + if (flags & VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT) { + /* From the Vulkan 1.3.256 spec: + * + * "For the purposes of this command, a variable-sized + * descriptor binding with a descriptorCount of zero is treated + * as if the descriptorCount is one" + */ + variable_count = MAX2(1, binding->descriptorCount); + variable_stride = stride; + } else { + /* Since we're aligning to the maximum and since this is just a + * check for whether or not the max buffer size is big enough, we + * keep non_variable_size aligned to max_align. + */ + non_variable_size += stride * binding->descriptorCount; + non_variable_size = align64(non_variable_size, max_align); + } + } + } + + uint64_t buffer_size = non_variable_size; + if (variable_stride > 0) { + buffer_size += variable_stride * variable_count; + buffer_size = align64(buffer_size, max_align); + } + + uint32_t max_buffer_size; + if (pCreateInfo->flags & + VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR) + max_buffer_size = HK_PUSH_DESCRIPTOR_SET_SIZE; + else + max_buffer_size = HK_MAX_DESCRIPTOR_SET_SIZE; + + pSupport->supported = dynamic_buffer_count <= HK_MAX_DYNAMIC_BUFFERS && + buffer_size <= max_buffer_size; + + vk_foreach_struct(ext, pSupport->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_LAYOUT_SUPPORT: { + VkDescriptorSetVariableDescriptorCountLayoutSupport *vs = (void *)ext; + if (variable_stride > 0) { + vs->maxVariableDescriptorCount = + (max_buffer_size - non_variable_size) / variable_stride; + } else { + vs->maxVariableDescriptorCount = 0; + } + break; + } + + default: + vk_debug_ignored_stype(ext->sType); + break; + } + } +} diff --git a/src/asahi/vulkan/hk_descriptor_set_layout.h b/src/asahi/vulkan/hk_descriptor_set_layout.h new file mode 100644 index 00000000000..a21a885a918 --- /dev/null +++ b/src/asahi/vulkan/hk_descriptor_set_layout.h @@ -0,0 +1,75 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "hk_private.h" + +#include "vk_descriptor_set_layout.h" +#include "vk_object.h" + +struct hk_device; +struct hk_physical_device; +struct hk_sampler; +struct vk_pipeline_layout; + +struct hk_descriptor_set_binding_layout { + /* The type of the descriptors in this binding */ + VkDescriptorType type; + + /* Flags provided when this binding was created */ + VkDescriptorBindingFlags flags; + + /* Number of array elements in this binding (or size in bytes for inline + * uniform data) + */ + uint32_t array_size; + + /* Offset into the descriptor buffer where this descriptor lives */ + uint32_t offset; + + /* Stride between array elements in the descriptor buffer */ + uint8_t stride; + + /* Index into the dynamic buffer binding array */ + uint8_t dynamic_buffer_index; + + /* Immutable samplers (or NULL if no immutable samplers) */ + struct hk_sampler **immutable_samplers; +}; + +struct hk_descriptor_set_layout { + struct vk_descriptor_set_layout vk; + + /* Size of the descriptor buffer for this descriptor set */ + /* Does not contain the size needed for variable count descriptors */ + uint32_t non_variable_descriptor_buffer_size; + + /* Number of dynamic UBO bindings in this set */ + uint8_t dynamic_buffer_count; + + /* Number of bindings in this descriptor set */ + uint32_t binding_count; + + /* Bindings in this descriptor set */ + struct hk_descriptor_set_binding_layout binding[0]; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_descriptor_set_layout, vk.base, + VkDescriptorSetLayout, + VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT) + +void hk_descriptor_stride_align_for_type( + const struct hk_physical_device *pdev, VkDescriptorType type, + const VkMutableDescriptorTypeListEXT *type_list, uint32_t *stride, + uint32_t *alignment); + +static inline struct hk_descriptor_set_layout * +vk_to_hk_descriptor_set_layout(struct vk_descriptor_set_layout *layout) +{ + return container_of(layout, struct hk_descriptor_set_layout, vk); +} diff --git a/src/asahi/vulkan/hk_descriptor_table.c b/src/asahi/vulkan/hk_descriptor_table.c new file mode 100644 index 00000000000..6d07ac6f384 --- /dev/null +++ b/src/asahi/vulkan/hk_descriptor_table.c @@ -0,0 +1,179 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ +#include "hk_descriptor_table.h" + +#include "hk_device.h" +#include "hk_physical_device.h" + +#include "asahi/lib/agx_bo.h" +#include + +static VkResult +hk_descriptor_table_grow_locked(struct hk_device *dev, + struct hk_descriptor_table *table, + uint32_t new_alloc) +{ + struct agx_bo *new_bo; + uint32_t *new_free_table; + + assert(new_alloc > table->alloc && new_alloc <= table->max_alloc); + + const uint32_t new_bo_size = new_alloc * table->desc_size; + new_bo = agx_bo_create(&dev->dev, new_bo_size, 0, "Descriptor table"); + + if (new_bo == NULL) { + return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "Failed to allocate the descriptor table"); + } + + void *new_map = new_bo->ptr.cpu; + + assert(table->bo == NULL && "not yet implemented sparse binding"); + table->bo = new_bo; + table->map = new_map; + + const size_t new_free_table_size = new_alloc * sizeof(uint32_t); + new_free_table = + vk_realloc(&dev->vk.alloc, table->free_table, new_free_table_size, 4, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (new_free_table == NULL) { + return vk_errorf(dev, VK_ERROR_OUT_OF_HOST_MEMORY, + "Failed to allocate image descriptor free table"); + } + table->free_table = new_free_table; + + table->alloc = new_alloc; + + return VK_SUCCESS; +} + +VkResult +hk_descriptor_table_init(struct hk_device *dev, + struct hk_descriptor_table *table, + uint32_t descriptor_size, + uint32_t min_descriptor_count, + uint32_t max_descriptor_count) +{ + memset(table, 0, sizeof(*table)); + VkResult result; + + simple_mtx_init(&table->mutex, mtx_plain); + + assert(util_is_power_of_two_nonzero(min_descriptor_count)); + assert(util_is_power_of_two_nonzero(max_descriptor_count)); + + /* TODO: sparse binding for stable gpu va */ + min_descriptor_count = max_descriptor_count; + + table->desc_size = descriptor_size; + table->alloc = 0; + table->max_alloc = max_descriptor_count; + table->next_desc = 0; + table->free_count = 0; + + result = hk_descriptor_table_grow_locked(dev, table, min_descriptor_count); + if (result != VK_SUCCESS) { + hk_descriptor_table_finish(dev, table); + return result; + } + + return VK_SUCCESS; +} + +void +hk_descriptor_table_finish(struct hk_device *dev, + struct hk_descriptor_table *table) +{ + agx_bo_unreference(table->bo); + vk_free(&dev->vk.alloc, table->free_table); + simple_mtx_destroy(&table->mutex); +} + +#define HK_IMAGE_DESC_INVALID + +static VkResult +hk_descriptor_table_alloc_locked(struct hk_device *dev, + struct hk_descriptor_table *table, + uint32_t *index_out) +{ + VkResult result; + + if (table->free_count > 0) { + *index_out = table->free_table[--table->free_count]; + return VK_SUCCESS; + } + + if (table->next_desc < table->alloc) { + *index_out = table->next_desc++; + return VK_SUCCESS; + } + + if (table->next_desc >= table->max_alloc) { + return vk_errorf(dev, VK_ERROR_OUT_OF_HOST_MEMORY, + "Descriptor table not large enough"); + } + + result = hk_descriptor_table_grow_locked(dev, table, table->alloc * 2); + if (result != VK_SUCCESS) + return result; + + assert(table->next_desc < table->alloc); + *index_out = table->next_desc++; + + return VK_SUCCESS; +} + +static VkResult +hk_descriptor_table_add_locked(struct hk_device *dev, + struct hk_descriptor_table *table, + const void *desc_data, size_t desc_size, + uint32_t *index_out) +{ + VkResult result = hk_descriptor_table_alloc_locked(dev, table, index_out); + if (result != VK_SUCCESS) + return result; + + void *map = (char *)table->map + (*index_out * table->desc_size); + + assert(desc_size == 
table->desc_size); + memcpy(map, desc_data, table->desc_size); + + return VK_SUCCESS; +} + +VkResult +hk_descriptor_table_add(struct hk_device *dev, + struct hk_descriptor_table *table, + const void *desc_data, size_t desc_size, + uint32_t *index_out) +{ + simple_mtx_lock(&table->mutex); + VkResult result = hk_descriptor_table_add_locked(dev, table, desc_data, + desc_size, index_out); + simple_mtx_unlock(&table->mutex); + + return result; +} + +void +hk_descriptor_table_remove(struct hk_device *dev, + struct hk_descriptor_table *table, uint32_t index) +{ + simple_mtx_lock(&table->mutex); + + void *map = (char *)table->map + (index * table->desc_size); + memset(map, 0, table->desc_size); + + /* Sanity check for double-free */ + assert(table->free_count < table->alloc); + for (uint32_t i = 0; i < table->free_count; i++) + assert(table->free_table[i] != index); + + table->free_table[table->free_count++] = index; + + simple_mtx_unlock(&table->mutex); +} diff --git a/src/asahi/vulkan/hk_descriptor_table.h b/src/asahi/vulkan/hk_descriptor_table.h new file mode 100644 index 00000000000..759bcf8a4b5 --- /dev/null +++ b/src/asahi/vulkan/hk_descriptor_table.h @@ -0,0 +1,49 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "hk_private.h" + +#include "asahi/lib/agx_bo.h" +#include "util/simple_mtx.h" + +struct hk_device; + +struct hk_descriptor_table { + simple_mtx_t mutex; + + uint32_t desc_size; /**< Size of a descriptor */ + uint32_t alloc; /**< Number of descriptors allocated */ + uint32_t max_alloc; /**< Maximum possible number of descriptors */ + uint32_t next_desc; /**< Next unallocated descriptor */ + uint32_t free_count; /**< Size of free_table */ + + struct agx_bo *bo; + void *map; + + /* Stack for free descriptor elements */ + uint32_t *free_table; +}; + +VkResult hk_descriptor_table_init(struct hk_device *dev, + struct hk_descriptor_table *table, + uint32_t descriptor_size, + uint32_t min_descriptor_count, + uint32_t max_descriptor_count); + +void hk_descriptor_table_finish(struct hk_device *dev, + struct hk_descriptor_table *table); + +VkResult hk_descriptor_table_add(struct hk_device *dev, + struct hk_descriptor_table *table, + const void *desc_data, size_t desc_size, + uint32_t *index_out); + +void hk_descriptor_table_remove(struct hk_device *dev, + struct hk_descriptor_table *table, + uint32_t index); diff --git a/src/asahi/vulkan/hk_device.c b/src/asahi/vulkan/hk_device.c new file mode 100644 index 00000000000..f5c4535aca2 --- /dev/null +++ b/src/asahi/vulkan/hk_device.c @@ -0,0 +1,548 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ +#include "hk_device.h" + +#include "agx_bg_eot.h" +#include "agx_opcodes.h" +#include "agx_scratch.h" +#include "hk_cmd_buffer.h" +#include "hk_descriptor_table.h" +#include "hk_entrypoints.h" +#include "hk_instance.h" +#include "hk_physical_device.h" +#include "hk_shader.h" + +#include "asahi/genxml/agx_pack.h" +#include "asahi/lib/agx_bo.h" +#include "asahi/lib/agx_device.h" +#include "asahi/lib/shaders/geometry.h" +#include "util/hash_table.h" +#include "util/os_file.h" +#include "util/ralloc.h" +#include "util/simple_mtx.h" +#include "vulkan/vulkan_core.h" +#include "vulkan/wsi/wsi_common.h" +#include "vk_cmd_enqueue_entrypoints.h" +#include "vk_common_entrypoints.h" +#include "vk_pipeline_cache.h" + +#include +#include + +/* + * We preupload some constants so we can cheaply reference later without extra + * allocation and copying. + * + * TODO: This is small, don't waste a whole BO. + */ +static VkResult +hk_upload_rodata(struct hk_device *dev) +{ + dev->rodata.bo = + agx_bo_create(&dev->dev, AGX_SAMPLER_LENGTH, 0, "Read only data"); + + if (!dev->rodata.bo) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + uint8_t *map = dev->rodata.bo->ptr.cpu; + uint32_t offs = 0; + + offs = align(offs, 8); + agx_pack(&dev->rodata.txf_sampler, USC_SAMPLER, cfg) { + cfg.start = 0; + cfg.count = 1; + cfg.buffer = dev->rodata.bo->ptr.gpu + offs; + } + + agx_pack(map + offs, SAMPLER, cfg) { + /* Allow mipmapping. This is respected by txf, weirdly. */ + cfg.mip_filter = AGX_MIP_FILTER_NEAREST; + + /* Out-of-bounds reads must return 0 */ + cfg.wrap_s = AGX_WRAP_CLAMP_TO_BORDER; + cfg.wrap_t = AGX_WRAP_CLAMP_TO_BORDER; + cfg.wrap_r = AGX_WRAP_CLAMP_TO_BORDER; + cfg.border_colour = AGX_BORDER_COLOUR_TRANSPARENT_BLACK; + } + offs += AGX_SAMPLER_LENGTH; + + /* The image heap is allocated on the device prior to the rodata. The heap + * lives as long as the device does and has a stable address (requiring + * sparse binding to grow dynamically). That means its address is effectively + * rodata and can be uploaded now. agx_usc_uniform requires an indirection to + * push the heap address, so this takes care of that indirection up front to + * cut an alloc/upload at draw time. + */ + offs = align(offs, sizeof(uint64_t)); + agx_pack(&dev->rodata.image_heap, USC_UNIFORM, cfg) { + cfg.start_halfs = HK_IMAGE_HEAP_UNIFORM; + cfg.size_halfs = 4; + cfg.buffer = dev->rodata.bo->ptr.gpu + offs; + } + + uint64_t *image_heap_ptr = dev->rodata.bo->ptr.cpu + offs; + *image_heap_ptr = dev->images.bo->ptr.gpu; + offs += sizeof(uint64_t); + + /* The geometry state buffer isn't strictly readonly data, but we only have a + * single instance of it device-wide and -- after initializing at heap + * allocate time -- it is read-only from the CPU perspective. The GPU uses it + * for scratch, but is required to reset it after use to ensure resubmitting + * the same command buffer works. + * + * So, we allocate it here for convenience. + */ + offs = align(offs, sizeof(uint64_t)); + dev->rodata.geometry_state = dev->rodata.bo->ptr.gpu + offs; + offs += sizeof(struct agx_geometry_state); + + /* For null readonly buffers, we need to allocate 16 bytes of zeroes for + * robustness2 semantics on read. + */ + offs = align(offs, 16); + dev->rodata.zero_sink = dev->rodata.bo->ptr.gpu + offs; + memset(dev->rodata.bo->ptr.cpu + offs, 0, 16); + offs += 16; + + /* For null storage descriptors, we need to reserve 16 bytes to catch writes. 
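+    * This is kept separate from zero_sink so that stray writes through null
+    * storage descriptors cannot clobber the zeroes that null read-only
+    * buffers rely on.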
+ * No particular content is required; we cannot get robustness2 semantics + * without more work. + */ + offs = align(offs, 16); + dev->rodata.null_sink = dev->rodata.bo->ptr.gpu + offs; + offs += 16; + + return VK_SUCCESS; +} + +static uint32_t +internal_key_hash(const void *key_) +{ + const struct hk_internal_key *key = key_; + + return _mesa_hash_data(key, sizeof(struct hk_internal_key) + key->key_size); +} + +static bool +internal_key_equal(const void *a_, const void *b_) +{ + const struct hk_internal_key *a = a_; + const struct hk_internal_key *b = b_; + + return a->builder == b->builder && a->key_size == b->key_size && + memcmp(a->key, b->key, a->key_size) == 0; +} + +static VkResult +hk_init_internal_shaders(struct hk_internal_shaders *s) +{ + s->ht = _mesa_hash_table_create(NULL, internal_key_hash, internal_key_equal); + if (!s->ht) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + simple_mtx_init(&s->lock, mtx_plain); + return VK_SUCCESS; +} + +static void +hk_destroy_internal_shaders(struct hk_device *dev, + struct hk_internal_shaders *s, bool part) +{ + hash_table_foreach(s->ht, ent) { + if (part) { + struct agx_shader_part *part = ent->data; + free(part->binary); + + /* The agx_shader_part itself is ralloc'd against the hash table so + * will be freed. + */ + } else { + struct hk_api_shader *obj = ent->data; + hk_api_shader_destroy(&dev->vk, &obj->vk, NULL); + } + } + + _mesa_hash_table_destroy(s->ht, NULL); + simple_mtx_destroy(&s->lock); +} + +DERIVE_HASH_TABLE(agx_sampler_packed); + +static VkResult +hk_init_sampler_heap(struct hk_device *dev, struct hk_sampler_heap *h) +{ + h->ht = agx_sampler_packed_table_create(NULL); + if (!h->ht) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + VkResult result = + hk_descriptor_table_init(dev, &h->table, AGX_SAMPLER_LENGTH, 1024, 1024); + + if (result != VK_SUCCESS) { + ralloc_free(h->ht); + return result; + } + + simple_mtx_init(&h->lock, mtx_plain); + return VK_SUCCESS; +} + +static void +hk_destroy_sampler_heap(struct hk_device *dev, struct hk_sampler_heap *h) +{ + hk_descriptor_table_finish(dev, &h->table); + ralloc_free(h->ht); + simple_mtx_destroy(&h->lock); +} + +static VkResult +hk_sampler_heap_add_locked(struct hk_device *dev, struct hk_sampler_heap *h, + struct agx_sampler_packed desc, + struct hk_rc_sampler **out) +{ + struct hash_entry *ent = _mesa_hash_table_search(h->ht, &desc); + if (ent != NULL) { + *out = ent->data; + + assert((*out)->refcount != 0); + (*out)->refcount++; + + return VK_SUCCESS; + } + + struct hk_rc_sampler *rc = ralloc(h->ht, struct hk_rc_sampler); + if (!rc) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + uint32_t index; + VkResult result = + hk_descriptor_table_add(dev, &h->table, &desc, sizeof(desc), &index); + if (result != VK_SUCCESS) { + ralloc_free(rc); + return result; + } + + *rc = (struct hk_rc_sampler){ + .key = desc, + .refcount = 1, + .index = index, + }; + + _mesa_hash_table_insert(h->ht, &rc->key, rc); + *out = rc; + + return VK_SUCCESS; +} + +VkResult +hk_sampler_heap_add(struct hk_device *dev, struct agx_sampler_packed desc, + struct hk_rc_sampler **out) +{ + struct hk_sampler_heap *h = &dev->samplers; + + simple_mtx_lock(&h->lock); + VkResult result = hk_sampler_heap_add_locked(dev, h, desc, out); + simple_mtx_unlock(&h->lock); + + return result; +} + +static void +hk_sampler_heap_remove_locked(struct hk_device *dev, struct hk_sampler_heap *h, + struct hk_rc_sampler *rc) +{ + assert(rc->refcount != 0); + rc->refcount--; + + if (rc->refcount == 0) { + hk_descriptor_table_remove(dev, &h->table, rc->index); + 
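+      /* The hash table entry is keyed on rc->key, which lives inside rc
+       * itself, so drop the entry before freeing rc below.
+       */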
_mesa_hash_table_remove_key(h->ht, &rc->key); + ralloc_free(rc); + } +} + +void +hk_sampler_heap_remove(struct hk_device *dev, struct hk_rc_sampler *rc) +{ + struct hk_sampler_heap *h = &dev->samplers; + + simple_mtx_lock(&h->lock); + hk_sampler_heap_remove_locked(dev, h, rc); + simple_mtx_unlock(&h->lock); +} + +/* + * To implement nullDescriptor, the descriptor set code will reference + * preuploaded null descriptors at fixed offsets in the image heap. Here we + * upload those descriptors, initializing the image heap. + */ +static void +hk_upload_null_descriptors(struct hk_device *dev) +{ + struct agx_texture_packed null_tex; + struct agx_pbe_packed null_pbe; + uint32_t offset_tex, offset_pbe; + + agx_set_null_texture(&null_tex, dev->rodata.null_sink); + agx_set_null_pbe(&null_pbe, dev->rodata.null_sink); + + hk_descriptor_table_add(dev, &dev->images, &null_tex, sizeof(null_tex), + &offset_tex); + + hk_descriptor_table_add(dev, &dev->images, &null_pbe, sizeof(null_pbe), + &offset_pbe); + + assert((offset_tex * HK_IMAGE_STRIDE) == HK_NULL_TEX_OFFSET && "static"); + assert((offset_pbe * HK_IMAGE_STRIDE) == HK_NULL_PBE_OFFSET && "static"); +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateDevice(VkPhysicalDevice physicalDevice, + const VkDeviceCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, VkDevice *pDevice) +{ + VK_FROM_HANDLE(hk_physical_device, pdev, physicalDevice); + VkResult result = VK_ERROR_OUT_OF_HOST_MEMORY; + struct hk_device *dev; + + dev = vk_zalloc2(&pdev->vk.instance->alloc, pAllocator, sizeof(*dev), 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + if (!dev) + return vk_error(pdev, VK_ERROR_OUT_OF_HOST_MEMORY); + + struct vk_device_dispatch_table dispatch_table; + + /* For secondary command buffer support, overwrite any command entrypoints + * in the main device-level dispatch table with + * vk_cmd_enqueue_unless_primary_Cmd*. 
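+    * Secondary command buffers are then recorded as a list of vk_cmd_queue
+    * commands and replayed through dev->cmd_dispatch when they are executed
+    * on a primary command buffer.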
+ */ + vk_device_dispatch_table_from_entrypoints( + &dispatch_table, &vk_cmd_enqueue_unless_primary_device_entrypoints, true); + + vk_device_dispatch_table_from_entrypoints(&dispatch_table, + &hk_device_entrypoints, false); + vk_device_dispatch_table_from_entrypoints(&dispatch_table, + &wsi_device_entrypoints, false); + + /* Populate primary cmd_dispatch table */ + vk_device_dispatch_table_from_entrypoints(&dev->cmd_dispatch, + &hk_device_entrypoints, true); + vk_device_dispatch_table_from_entrypoints(&dev->cmd_dispatch, + &wsi_device_entrypoints, false); + vk_device_dispatch_table_from_entrypoints( + &dev->cmd_dispatch, &vk_common_device_entrypoints, false); + + result = vk_device_init(&dev->vk, &pdev->vk, &dispatch_table, pCreateInfo, + pAllocator); + if (result != VK_SUCCESS) + goto fail_alloc; + + dev->vk.shader_ops = &hk_device_shader_ops; + dev->vk.command_dispatch_table = &dev->cmd_dispatch; + + drmDevicePtr drm_device = NULL; + int ret = drmGetDeviceFromDevId(pdev->render_dev, 0, &drm_device); + if (ret != 0) { + result = vk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED, + "Failed to get DRM device: %m"); + goto fail_init; + } + + const char *path = drm_device->nodes[DRM_NODE_RENDER]; + dev->dev.fd = open(path, O_RDWR | O_CLOEXEC); + if (dev->dev.fd < 0) { + drmFreeDevice(&drm_device); + result = vk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED, + "failed to open device %s", path); + goto fail_init; + } + + bool succ = agx_open_device(NULL, &dev->dev); + drmFreeDevice(&drm_device); + if (!succ) { + result = vk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED, + "Failed to get DRM device: %m"); + goto fail_fd; + } + + vk_device_set_drm_fd(&dev->vk, dev->dev.fd); + dev->vk.command_buffer_ops = &hk_cmd_buffer_ops; + + result = hk_descriptor_table_init(dev, &dev->images, AGX_TEXTURE_LENGTH, + 1024, 1024 * 1024); + if (result != VK_SUCCESS) + goto fail_dev; + + result = hk_init_sampler_heap(dev, &dev->samplers); + if (result != VK_SUCCESS) + goto fail_images; + + result = hk_descriptor_table_init( + dev, &dev->occlusion_queries, sizeof(uint64_t), AGX_MAX_OCCLUSION_QUERIES, + AGX_MAX_OCCLUSION_QUERIES); + if (result != VK_SUCCESS) + goto fail_samplers; + + result = hk_upload_rodata(dev); + if (result != VK_SUCCESS) + goto fail_queries; + + /* Depends on rodata */ + hk_upload_null_descriptors(dev); + + /* XXX: error handling, and should this even go on the device? 
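+    * (bg_eot caches the background and end-of-tile programs that load and
+    * store the tilebuffer at the start and end of each tile pass.)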
*/ + agx_bg_eot_init(&dev->bg_eot, &dev->dev); + if (!dev->bg_eot.ht) { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + goto fail_rodata; + } + + result = hk_init_internal_shaders(&dev->prolog_epilog); + if (result != VK_SUCCESS) + goto fail_bg_eot; + + result = hk_init_internal_shaders(&dev->kernels); + if (result != VK_SUCCESS) + goto fail_internal_shaders; + + result = + hk_queue_init(dev, &dev->queue, &pCreateInfo->pQueueCreateInfos[0], 0); + if (result != VK_SUCCESS) + goto fail_internal_shaders_2; + + struct vk_pipeline_cache_create_info cache_info = { + .weak_ref = true, + }; + dev->mem_cache = vk_pipeline_cache_create(&dev->vk, &cache_info, NULL); + if (dev->mem_cache == NULL) { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + goto fail_queue; + } + + result = hk_device_init_meta(dev); + if (result != VK_SUCCESS) + goto fail_mem_cache; + + *pDevice = hk_device_to_handle(dev); + + agx_scratch_init(&dev->dev, &dev->scratch.vs); + agx_scratch_init(&dev->dev, &dev->scratch.fs); + agx_scratch_init(&dev->dev, &dev->scratch.cs); + + return VK_SUCCESS; + +fail_mem_cache: + vk_pipeline_cache_destroy(dev->mem_cache, NULL); +fail_queue: + hk_queue_finish(dev, &dev->queue); +fail_rodata: + agx_bo_unreference(dev->rodata.bo); +fail_bg_eot: + agx_bg_eot_cleanup(&dev->bg_eot); +fail_internal_shaders_2: + hk_destroy_internal_shaders(dev, &dev->kernels, false); +fail_internal_shaders: + hk_destroy_internal_shaders(dev, &dev->prolog_epilog, true); +fail_queries: + hk_descriptor_table_finish(dev, &dev->occlusion_queries); +fail_samplers: + hk_destroy_sampler_heap(dev, &dev->samplers); +fail_images: + hk_descriptor_table_finish(dev, &dev->images); +fail_dev: + agx_close_device(&dev->dev); +fail_fd: + close(dev->dev.fd); +fail_init: + vk_device_finish(&dev->vk); +fail_alloc: + vk_free(&dev->vk.alloc, dev); + return result; +} + +VKAPI_ATTR void VKAPI_CALL +hk_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, dev, _device); + + if (!dev) + return; + + hk_device_finish_meta(dev); + hk_destroy_internal_shaders(dev, &dev->kernels, false); + hk_destroy_internal_shaders(dev, &dev->prolog_epilog, true); + + vk_pipeline_cache_destroy(dev->mem_cache, NULL); + hk_queue_finish(dev, &dev->queue); + vk_device_finish(&dev->vk); + + agx_scratch_fini(&dev->scratch.vs); + agx_scratch_fini(&dev->scratch.fs); + agx_scratch_fini(&dev->scratch.cs); + + hk_destroy_sampler_heap(dev, &dev->samplers); + hk_descriptor_table_finish(dev, &dev->images); + hk_descriptor_table_finish(dev, &dev->occlusion_queries); + agx_bo_unreference(dev->rodata.bo); + agx_bo_unreference(dev->heap); + agx_bg_eot_cleanup(&dev->bg_eot); + agx_close_device(&dev->dev); + vk_free(&dev->vk.alloc, dev); +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_GetCalibratedTimestampsKHR( + VkDevice _device, uint32_t timestampCount, + const VkCalibratedTimestampInfoKHR *pTimestampInfos, uint64_t *pTimestamps, + uint64_t *pMaxDeviation) +{ + // VK_FROM_HANDLE(hk_device, dev, _device); + // struct hk_physical_device *pdev = hk_device_physical(dev); + uint64_t max_clock_period = 0; + uint64_t begin, end; + int d; + +#ifdef CLOCK_MONOTONIC_RAW + begin = vk_clock_gettime(CLOCK_MONOTONIC_RAW); +#else + begin = vk_clock_gettime(CLOCK_MONOTONIC); +#endif + + for (d = 0; d < timestampCount; d++) { + switch (pTimestampInfos[d].timeDomain) { + case VK_TIME_DOMAIN_DEVICE_KHR: + unreachable("todo"); + // pTimestamps[d] = agx_get_gpu_timestamp(&pdev->dev); + max_clock_period = MAX2( + max_clock_period, 1); /* FIXME: Is timestamp period actually 
1? */ + break; + case VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR: + pTimestamps[d] = vk_clock_gettime(CLOCK_MONOTONIC); + max_clock_period = MAX2(max_clock_period, 1); + break; + +#ifdef CLOCK_MONOTONIC_RAW + case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR: + pTimestamps[d] = begin; + break; +#endif + default: + pTimestamps[d] = 0; + break; + } + } + +#ifdef CLOCK_MONOTONIC_RAW + end = vk_clock_gettime(CLOCK_MONOTONIC_RAW); +#else + end = vk_clock_gettime(CLOCK_MONOTONIC); +#endif + + *pMaxDeviation = vk_time_max_deviation(begin, end, max_clock_period); + + return VK_SUCCESS; +} diff --git a/src/asahi/vulkan/hk_device.h b/src/asahi/vulkan/hk_device.h new file mode 100644 index 00000000000..b6c57315390 --- /dev/null +++ b/src/asahi/vulkan/hk_device.h @@ -0,0 +1,123 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "asahi/lib/agx_device.h" +#include "agx_bg_eot.h" +#include "agx_pack.h" +#include "agx_scratch.h" +#include "decode.h" +#include "vk_cmd_queue.h" +#include "vk_dispatch_table.h" + +#include "hk_private.h" + +#include "hk_descriptor_table.h" +#include "hk_queue.h" +#include "vk_device.h" +#include "vk_meta.h" +#include "vk_queue.h" + +struct hk_physical_device; +struct vk_pipeline_cache; + +/* Fixed offsets for reserved null image descriptors */ +#define HK_NULL_TEX_OFFSET (0) +#define HK_NULL_PBE_OFFSET (24) + +typedef void (*hk_internal_builder_t)(struct nir_builder *b, const void *key); + +struct hk_internal_key { + hk_internal_builder_t builder; + size_t key_size; + uint8_t key[]; +}; + +struct hk_internal_shaders { + simple_mtx_t lock; + struct hash_table *ht; +}; + +struct hk_rc_sampler { + struct agx_sampler_packed key; + + /* Reference count for this hardware sampler, protected by the heap mutex */ + uint16_t refcount; + + /* Index of this hardware sampler in the hardware sampler heap */ + uint16_t index; +}; + +struct hk_sampler_heap { + simple_mtx_t lock; + + struct hk_descriptor_table table; + + /* Map of agx_sampler_packed to hk_rc_sampler */ + struct hash_table *ht; +}; + +struct hk_device { + struct vk_device vk; + struct agx_device dev; + struct agxdecode_ctx *decode_ctx; + + struct hk_descriptor_table images; + struct hk_descriptor_table occlusion_queries; + struct hk_sampler_heap samplers; + + struct hk_queue queue; + + struct vk_pipeline_cache *mem_cache; + + struct vk_meta_device meta; + struct agx_bg_eot_cache bg_eot; + + struct { + struct agx_bo *bo; + struct agx_usc_sampler_packed txf_sampler; + struct agx_usc_uniform_packed image_heap; + uint64_t null_sink, zero_sink; + uint64_t geometry_state; + } rodata; + + struct hk_internal_shaders prolog_epilog; + struct hk_internal_shaders kernels; + struct hk_api_shader *write_shader; + + /* Indirected for common secondary emulation */ + struct vk_device_dispatch_table cmd_dispatch; + + /* Heap used for GPU-side memory allocation for geometry/tessellation. + * + * Control streams accessing the heap must be serialized. This is not + * expected to be a legitimate problem. If it is, we can rework later. 
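+    * Allocated on demand rather than at device creation time.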
+ */ + struct agx_bo *heap; + + struct { + struct agx_scratch vs, fs, cs; + } scratch; +}; + +VK_DEFINE_HANDLE_CASTS(hk_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE) + +static inline struct hk_physical_device * +hk_device_physical(struct hk_device *dev) +{ + return (struct hk_physical_device *)dev->vk.physical; +} + +VkResult hk_device_init_meta(struct hk_device *dev); +void hk_device_finish_meta(struct hk_device *dev); + +VkResult hk_sampler_heap_add(struct hk_device *dev, + struct agx_sampler_packed desc, + struct hk_rc_sampler **out); + +void hk_sampler_heap_remove(struct hk_device *dev, struct hk_rc_sampler *rc); diff --git a/src/asahi/vulkan/hk_device_memory.c b/src/asahi/vulkan/hk_device_memory.c new file mode 100644 index 00000000000..0d10a55f5df --- /dev/null +++ b/src/asahi/vulkan/hk_device_memory.c @@ -0,0 +1,330 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_device_memory.h" + +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_image.h" +#include "hk_physical_device.h" + +#include "asahi/lib/agx_bo.h" +#include "util/u_atomic.h" + +#include +#include + +/* Supports opaque fd only */ +const VkExternalMemoryProperties hk_opaque_fd_mem_props = { + .externalMemoryFeatures = VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT | + VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT, + .exportFromImportedHandleTypes = + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT, + .compatibleHandleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT, +}; + +/* Supports opaque fd and dma_buf. */ +const VkExternalMemoryProperties hk_dma_buf_mem_props = { + .externalMemoryFeatures = VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT | + VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT, + .exportFromImportedHandleTypes = + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT | + VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT, + .compatibleHandleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT | + VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT, +}; + +static enum agx_bo_flags +hk_memory_type_flags(const VkMemoryType *type, + VkExternalMemoryHandleTypeFlagBits handle_types) +{ + unsigned flags = 0; + + if (handle_types) + flags |= AGX_BO_SHARED | AGX_BO_SHAREABLE; + + return flags; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_GetMemoryFdPropertiesKHR(VkDevice device, + VkExternalMemoryHandleTypeFlagBits handleType, + int fd, + VkMemoryFdPropertiesKHR *pMemoryFdProperties) +{ + VK_FROM_HANDLE(hk_device, dev, device); + struct hk_physical_device *pdev = hk_device_physical(dev); + struct agx_bo *bo; + + switch (handleType) { + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT: + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT: + bo = agx_bo_import(&dev->dev, fd); + if (bo == NULL) + return vk_error(dev, VK_ERROR_INVALID_EXTERNAL_HANDLE); + break; + default: + return vk_error(dev, VK_ERROR_INVALID_EXTERNAL_HANDLE); + } + + uint32_t type_bits = 0; + for (unsigned t = 0; t < ARRAY_SIZE(pdev->mem_types); t++) { + const unsigned flags = + hk_memory_type_flags(&pdev->mem_types[t], handleType); + if (!(flags & ~bo->flags)) + type_bits |= (1 << t); + } + + pMemoryFdProperties->memoryTypeBits = type_bits; + + agx_bo_unreference(bo); + + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_AllocateMemory(VkDevice device, const VkMemoryAllocateInfo *pAllocateInfo, + const VkAllocationCallbacks *pAllocator, VkDeviceMemory *pMem) +{ + VK_FROM_HANDLE(hk_device, dev, device); + struct 
hk_physical_device *pdev = hk_device_physical(dev); + struct hk_device_memory *mem; + VkResult result = VK_SUCCESS; + + const VkImportMemoryFdInfoKHR *fd_info = + vk_find_struct_const(pAllocateInfo->pNext, IMPORT_MEMORY_FD_INFO_KHR); + const VkExportMemoryAllocateInfo *export_info = + vk_find_struct_const(pAllocateInfo->pNext, EXPORT_MEMORY_ALLOCATE_INFO); + const VkMemoryType *type = &pdev->mem_types[pAllocateInfo->memoryTypeIndex]; + + VkExternalMemoryHandleTypeFlagBits handle_types = 0; + if (export_info != NULL) + handle_types |= export_info->handleTypes; + if (fd_info != NULL) + handle_types |= fd_info->handleType; + + const unsigned flags = hk_memory_type_flags(type, handle_types); + + uint32_t alignment = 16384; /* Apple page size */ + + struct hk_memory_heap *heap = &pdev->mem_heaps[type->heapIndex]; + if (p_atomic_read(&heap->used) > heap->size) + return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY); + + const uint64_t aligned_size = + align64(pAllocateInfo->allocationSize, alignment); + + mem = vk_device_memory_create(&dev->vk, pAllocateInfo, pAllocator, + sizeof(*mem)); + if (!mem) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + mem->map = NULL; + if (fd_info && fd_info->handleType) { + assert( + fd_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT || + fd_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT); + + mem->bo = agx_bo_import(&dev->dev, fd_info->fd); + if (mem->bo == NULL) { + result = vk_error(dev, VK_ERROR_INVALID_EXTERNAL_HANDLE); + goto fail_alloc; + } + assert(!(flags & ~mem->bo->flags)); + } else { + enum agx_bo_flags flags = 0; + if (handle_types) + flags |= AGX_BO_SHAREABLE; + + mem->bo = agx_bo_create(&dev->dev, aligned_size, flags, "App memory"); + if (!mem->bo) { + result = vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY); + goto fail_alloc; + } + } + + if (fd_info && fd_info->handleType) { + /* From the Vulkan spec: + * + * "Importing memory from a file descriptor transfers ownership of + * the file descriptor from the application to the Vulkan + * implementation. The application must not perform any operations on + * the file descriptor after a successful import." + * + * If the import fails, we leave the file descriptor open. 
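+       * On success, the import takes its own reference to the underlying
+       * memory, so the fd itself is no longer needed and is closed here.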
+ */ + close(fd_info->fd); + } + + uint64_t heap_used = p_atomic_add_return(&heap->used, mem->bo->size); + if (heap_used > heap->size) { + hk_FreeMemory(device, hk_device_memory_to_handle(mem), pAllocator); + return vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "Out of heap memory"); + } + + *pMem = hk_device_memory_to_handle(mem); + + return VK_SUCCESS; + +fail_alloc: + vk_device_memory_destroy(&dev->vk, pAllocator, &mem->vk); + return result; +} + +VKAPI_ATTR void VKAPI_CALL +hk_FreeMemory(VkDevice device, VkDeviceMemory _mem, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_device_memory, mem, _mem); + struct hk_physical_device *pdev = hk_device_physical(dev); + + if (!mem) + return; + + const VkMemoryType *type = &pdev->mem_types[mem->vk.memory_type_index]; + struct hk_memory_heap *heap = &pdev->mem_heaps[type->heapIndex]; + p_atomic_add(&heap->used, -((int64_t)mem->bo->size)); + + agx_bo_unreference(mem->bo); + + vk_device_memory_destroy(&dev->vk, pAllocator, &mem->vk); +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_MapMemory2KHR(VkDevice device, const VkMemoryMapInfoKHR *pMemoryMapInfo, + void **ppData) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_device_memory, mem, pMemoryMapInfo->memory); + + if (mem == NULL) { + *ppData = NULL; + return VK_SUCCESS; + } + + const VkDeviceSize offset = pMemoryMapInfo->offset; + const VkDeviceSize size = vk_device_memory_range( + &mem->vk, pMemoryMapInfo->offset, pMemoryMapInfo->size); + + UNUSED void *fixed_addr = NULL; + if (pMemoryMapInfo->flags & VK_MEMORY_MAP_PLACED_BIT_EXT) { + const VkMemoryMapPlacedInfoEXT *placed_info = vk_find_struct_const( + pMemoryMapInfo->pNext, MEMORY_MAP_PLACED_INFO_EXT); + fixed_addr = placed_info->pPlacedAddress; + } + + /* From the Vulkan spec version 1.0.32 docs for MapMemory: + * + * * If size is not equal to VK_WHOLE_SIZE, size must be greater than 0 + * assert(size != 0); + * * If size is not equal to VK_WHOLE_SIZE, size must be less than or + * equal to the size of the memory minus offset + */ + assert(size > 0); + assert(offset + size <= mem->bo->size); + + if (size != (size_t)size) { + return vk_errorf(dev, VK_ERROR_MEMORY_MAP_FAILED, + "requested size 0x%" PRIx64 " does not fit in %u bits", + size, (unsigned)(sizeof(size_t) * 8)); + } + + /* From the Vulkan 1.2.194 spec: + * + * "memory must not be currently host mapped" + */ + if (mem->map != NULL) { + return vk_errorf(dev, VK_ERROR_MEMORY_MAP_FAILED, + "Memory object already mapped."); + } + + mem->map = mem->bo->ptr.cpu; + *ppData = mem->map + offset; + + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_UnmapMemory2KHR(VkDevice device, + const VkMemoryUnmapInfoKHR *pMemoryUnmapInfo) +{ + VK_FROM_HANDLE(hk_device_memory, mem, pMemoryUnmapInfo->memory); + + if (mem == NULL) + return VK_SUCCESS; + + if (pMemoryUnmapInfo->flags & VK_MEMORY_UNMAP_RESERVE_BIT_EXT) { + unreachable("todo"); +#if 0 + VK_FROM_HANDLE(hk_device, dev, device); + + int err = agx_bo_overmap(mem->bo, mem->map); + if (err) { + return vk_errorf(dev, VK_ERROR_MEMORY_MAP_FAILED, + "Failed to map over original mapping"); + } +#endif + } else { + /* TODO */ + //// agx_bo_unmap(mem->bo, mem->map); + } + + mem->map = NULL; + + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_FlushMappedMemoryRanges(VkDevice device, uint32_t memoryRangeCount, + const VkMappedMemoryRange *pMemoryRanges) +{ + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_InvalidateMappedMemoryRanges(VkDevice device, 
uint32_t memoryRangeCount, + const VkMappedMemoryRange *pMemoryRanges) +{ + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetDeviceMemoryCommitment(VkDevice device, VkDeviceMemory _mem, + VkDeviceSize *pCommittedMemoryInBytes) +{ + VK_FROM_HANDLE(hk_device_memory, mem, _mem); + + *pCommittedMemoryInBytes = mem->bo->size; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_GetMemoryFdKHR(VkDevice device, const VkMemoryGetFdInfoKHR *pGetFdInfo, + int *pFD) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_device_memory, memory, pGetFdInfo->memory); + + switch (pGetFdInfo->handleType) { + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT: + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT: + *pFD = agx_bo_export(memory->bo); + return VK_SUCCESS; + default: + assert(!"unsupported handle type"); + return vk_error(dev, VK_ERROR_FEATURE_NOT_PRESENT); + } +} + +VKAPI_ATTR uint64_t VKAPI_CALL +hk_GetDeviceMemoryOpaqueCaptureAddress( + UNUSED VkDevice device, const VkDeviceMemoryOpaqueCaptureAddressInfo *pInfo) +{ + VK_FROM_HANDLE(hk_device_memory, mem, pInfo->memory); + + return mem->bo->ptr.gpu; +} diff --git a/src/asahi/vulkan/hk_device_memory.h b/src/asahi/vulkan/hk_device_memory.h new file mode 100644 index 00000000000..29d3651972a --- /dev/null +++ b/src/asahi/vulkan/hk_device_memory.h @@ -0,0 +1,31 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "hk_private.h" + +#include "vk_device_memory.h" + +#include "util/list.h" + +struct hk_device; +struct hk_image_plane; + +struct hk_device_memory { + struct vk_device_memory vk; + + struct agx_bo *bo; + + void *map; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_device_memory, vk.base, VkDeviceMemory, + VK_OBJECT_TYPE_DEVICE_MEMORY) + +extern const VkExternalMemoryProperties hk_opaque_fd_mem_props; +extern const VkExternalMemoryProperties hk_dma_buf_mem_props; diff --git a/src/asahi/vulkan/hk_event.c b/src/asahi/vulkan/hk_event.c new file mode 100644 index 00000000000..aadbb272e76 --- /dev/null +++ b/src/asahi/vulkan/hk_event.c @@ -0,0 +1,113 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_event.h" +#include "vulkan/vulkan_core.h" + +#include "agx_bo.h" +#include "hk_cmd_buffer.h" +#include "hk_device.h" +#include "hk_entrypoints.h" + +#define HK_EVENT_MEM_SIZE sizeof(VkResult) + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateEvent(VkDevice device, const VkEventCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, VkEvent *pEvent) +{ + VK_FROM_HANDLE(hk_device, dev, device); + struct hk_event *event; + + event = vk_object_zalloc(&dev->vk, pAllocator, sizeof(*event), + VK_OBJECT_TYPE_EVENT); + if (!event) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + /* TODO: this is really wasteful, bring back the NVK heap! 
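+    * Each VkEvent currently burns an entire BO for sizeof(VkResult) bytes
+    * of state.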
+ * + * XXX + */ + event->bo = + agx_bo_create(&dev->dev, HK_EVENT_MEM_SIZE, AGX_BO_WRITEBACK, "Event"); + event->status = event->bo->ptr.cpu; + event->addr = event->bo->ptr.gpu; + + *event->status = VK_EVENT_RESET; + + *pEvent = hk_event_to_handle(event); + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +hk_DestroyEvent(VkDevice device, VkEvent _event, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_event, event, _event); + + if (!event) + return; + + agx_bo_unreference(event->bo); + vk_object_free(&dev->vk, pAllocator, event); +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_GetEventStatus(VkDevice device, VkEvent _event) +{ + VK_FROM_HANDLE(hk_event, event, _event); + + return *event->status; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_SetEvent(VkDevice device, VkEvent _event) +{ + VK_FROM_HANDLE(hk_event, event, _event); + + *event->status = VK_EVENT_SET; + + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_ResetEvent(VkDevice device, VkEvent _event) +{ + VK_FROM_HANDLE(hk_event, event, _event); + + *event->status = VK_EVENT_RESET; + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdSetEvent2(VkCommandBuffer commandBuffer, VkEvent _event, + const VkDependencyInfo *pDependencyInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_event, event, _event); + + hk_queue_write(cmd, event->bo->ptr.gpu, VK_EVENT_SET, false); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdResetEvent2(VkCommandBuffer commandBuffer, VkEvent _event, + VkPipelineStageFlags2 stageMask) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_event, event, _event); + + hk_queue_write(cmd, event->bo->ptr.gpu, VK_EVENT_RESET, false); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdWaitEvents2(VkCommandBuffer commandBuffer, uint32_t eventCount, + const VkEvent *pEvents, + const VkDependencyInfo *pDependencyInfos) +{ + /* Currently we barrier everything, so this is a no-op. */ +} diff --git a/src/asahi/vulkan/hk_event.h b/src/asahi/vulkan/hk_event.h new file mode 100644 index 00000000000..c675ceada8a --- /dev/null +++ b/src/asahi/vulkan/hk_event.h @@ -0,0 +1,22 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "hk_private.h" + +#include "vk_object.h" + +struct hk_event { + struct vk_object_base base; + struct agx_bo *bo; + + uint64_t addr; + VkResult *status; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_event, base, VkEvent, VK_OBJECT_TYPE_EVENT) diff --git a/src/asahi/vulkan/hk_format.c b/src/asahi/vulkan/hk_format.c new file mode 100644 index 00000000000..b0fa8ae5c99 --- /dev/null +++ b/src/asahi/vulkan/hk_format.c @@ -0,0 +1,140 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ +#include "drm-uapi/drm_fourcc.h" + +#include "hk_buffer_view.h" +#include "hk_entrypoints.h" +#include "hk_image.h" +#include "hk_physical_device.h" + +#include "vk_enum_defines.h" +#include "vk_format.h" + +uint64_t agx_best_modifiers[] = { + // DRM_FORMAT_MOD_APPLE_TWIDDLED_COMPRESSED, + DRM_FORMAT_MOD_APPLE_TWIDDLED, + DRM_FORMAT_MOD_LINEAR, +}; + +static VkFormatFeatureFlags2 +hk_modifier_features(uint64_t mod, VkFormat vk_format, + const VkFormatProperties *props) +{ + if (mod == DRM_FORMAT_MOD_LINEAR) + return props->linearTilingFeatures; + + if (mod == DRM_FORMAT_MOD_APPLE_TWIDDLED_COMPRESSED /* TODO */) + return 0; + + return props->optimalTilingFeatures; +} + +static void +get_drm_format_modifier_properties_list( + const struct hk_physical_device *physical_device, VkFormat vk_format, + VkDrmFormatModifierPropertiesListEXT *list, const VkFormatProperties *props) +{ + VK_OUTARRAY_MAKE_TYPED(VkDrmFormatModifierPropertiesEXT, out, + list->pDrmFormatModifierProperties, + &list->drmFormatModifierCount); + + for (unsigned i = 0; i < ARRAY_SIZE(agx_best_modifiers); ++i) { + uint64_t mod = agx_best_modifiers[i]; + VkFormatFeatureFlags2 flags = hk_modifier_features(mod, vk_format, props); + + if (!flags) + continue; + + vk_outarray_append_typed(VkDrmFormatModifierPropertiesEXT, &out, + out_props) + { + *out_props = (VkDrmFormatModifierPropertiesEXT){ + .drmFormatModifier = mod, + .drmFormatModifierPlaneCount = 1 /* no planar mods */, + .drmFormatModifierTilingFeatures = flags, + }; + }; + } +} + +static void +get_drm_format_modifier_properties_list_2( + const struct hk_physical_device *physical_device, VkFormat vk_format, + VkDrmFormatModifierPropertiesList2EXT *list, const VkFormatProperties *props) +{ + VK_OUTARRAY_MAKE_TYPED(VkDrmFormatModifierProperties2EXT, out, + list->pDrmFormatModifierProperties, + &list->drmFormatModifierCount); + + for (unsigned i = 0; i < ARRAY_SIZE(agx_best_modifiers); ++i) { + uint64_t mod = agx_best_modifiers[i]; + VkFormatFeatureFlags2 flags = hk_modifier_features(mod, vk_format, props); + + if (!flags) + continue; + + vk_outarray_append_typed(VkDrmFormatModifierProperties2EXT, &out, + out_props) + { + *out_props = (VkDrmFormatModifierProperties2EXT){ + .drmFormatModifier = mod, + .drmFormatModifierPlaneCount = 1, /* no planar mods */ + .drmFormatModifierTilingFeatures = flags, + }; + }; + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetPhysicalDeviceFormatProperties2(VkPhysicalDevice physicalDevice, + VkFormat format, + VkFormatProperties2 *pFormatProperties) +{ + VK_FROM_HANDLE(hk_physical_device, pdevice, physicalDevice); + + VkFormatFeatureFlags2 linear2, optimal2, buffer2; + linear2 = + hk_get_image_format_features(pdevice, format, VK_IMAGE_TILING_LINEAR); + optimal2 = + hk_get_image_format_features(pdevice, format, VK_IMAGE_TILING_OPTIMAL); + buffer2 = hk_get_buffer_format_features(pdevice, format); + + pFormatProperties->formatProperties = (VkFormatProperties){ + .linearTilingFeatures = vk_format_features2_to_features(linear2), + .optimalTilingFeatures = vk_format_features2_to_features(optimal2), + .bufferFeatures = vk_format_features2_to_features(buffer2), + }; + + vk_foreach_struct(ext, pFormatProperties->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_3: { + VkFormatProperties3 *p = (void *)ext; + p->linearTilingFeatures = linear2; + p->optimalTilingFeatures = optimal2; + p->bufferFeatures = buffer2; + break; + } + + case VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_EXT: + 
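+         /* Only modifiers with non-zero features for this format are
+          * reported.
+          */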
get_drm_format_modifier_properties_list( + pdevice, format, (void *)ext, &pFormatProperties->formatProperties); + break; + + case VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_2_EXT: + get_drm_format_modifier_properties_list_2( + pdevice, format, (void *)ext, &pFormatProperties->formatProperties); + break; + + default: + vk_debug_ignored_stype(ext->sType); + break; + } + } +} diff --git a/src/asahi/vulkan/hk_image.c b/src/asahi/vulkan/hk_image.c new file mode 100644 index 00000000000..6187eff40a8 --- /dev/null +++ b/src/asahi/vulkan/hk_image.c @@ -0,0 +1,1536 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_image.h" +#include "asahi/layout/layout.h" +#include "asahi/lib/agx_formats.h" +#include "drm-uapi/drm_fourcc.h" +#include "util/bitscan.h" +#include "util/format/u_format.h" +#include "util/format/u_formats.h" +#include "util/macros.h" +#include "util/u_math.h" +#include "vulkan/vulkan_core.h" + +#include "hk_device.h" +#include "hk_device_memory.h" +#include "hk_entrypoints.h" +#include "hk_physical_device.h" + +#include "vk_format.h" + +/* Minimum alignment encodable for our descriptors. The hardware texture/PBE + * descriptors require 16-byte alignment. Our software PBE atomic descriptor + * requires 128-byte alignment, but we could relax that one if we wanted. + */ +#define HK_PLANE_ALIGN_B 128 + +static VkFormatFeatureFlags2 +hk_get_image_plane_format_features(struct hk_physical_device *pdev, + VkFormat vk_format, VkImageTiling tiling) +{ + VkFormatFeatureFlags2 features = 0; + + /* Conformance fails with these optional formats. Just drop them for now. + * TODO: Investigate later if we have a use case. + */ + switch (vk_format) { + case VK_FORMAT_A1B5G5R5_UNORM_PACK16_KHR: + case VK_FORMAT_A8_UNORM_KHR: + return 0; + default: + break; + } + + enum pipe_format p_format = vk_format_to_pipe_format(vk_format); + if (p_format == PIPE_FORMAT_NONE) + return 0; + + /* NPOT formats only supported for texel buffers */ + if (!util_is_power_of_two_nonzero(util_format_get_blocksize(p_format))) + return 0; + + if (util_format_is_compressed(p_format)) { + /* Linear block-compressed images are all sorts of problematic, not sure + * if AGX even supports them. Don't try. + */ + if (tiling != VK_IMAGE_TILING_OPTIMAL) + return 0; + + /* XXX: Conformance fails, e.g.: + * dEQP-VK.pipeline.monolithic.sampler.view_type.2d.format.etc2_r8g8b8a1_unorm_block.mipmap.linear.lod.select_bias_3_7 + * + * I suspect ail bug with mipmapping of compressed :-/ + */ + switch (util_format_description(p_format)->layout) { + case UTIL_FORMAT_LAYOUT_ETC: + case UTIL_FORMAT_LAYOUT_ASTC: + return 0; + default: + break; + } + } + + if (agx_pixel_format[p_format].texturable) { + features |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT; + features |= VK_FORMAT_FEATURE_2_BLIT_SRC_BIT; + + /* We can sample integer formats but it doesn't make sense to linearly + * filter them. + */ + if (!util_format_is_pure_integer(p_format)) { + features |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT; + } + + if (vk_format_has_depth(vk_format)) { + features |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_DEPTH_COMPARISON_BIT; + } + } + + if (agx_pixel_format[p_format].renderable) { + /* For now, disable snorm rendering due to nir_lower_blend bugs. + * + * TODO: revisit. 
+ */ + if (!util_format_is_snorm(p_format)) { + features |= VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT; + features |= VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BLEND_BIT; + } + + features |= VK_FORMAT_FEATURE_2_BLIT_DST_BIT; + features |= VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT | + VK_FORMAT_FEATURE_2_STORAGE_WRITE_WITHOUT_FORMAT_BIT | + VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT; + } + + if (vk_format_is_depth_or_stencil(vk_format)) { + if (!(p_format == PIPE_FORMAT_Z32_FLOAT || + p_format == PIPE_FORMAT_S8_UINT || + p_format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT || + p_format == PIPE_FORMAT_Z16_UNORM) || + tiling == VK_IMAGE_TILING_LINEAR) + return 0; + + features |= VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT; + } + + /* Our image atomic lowering doesn't bother to handle linear */ + if ((p_format == PIPE_FORMAT_R32_UINT || p_format == PIPE_FORMAT_R32_SINT) && + tiling == VK_IMAGE_TILING_OPTIMAL) { + + features |= VK_FORMAT_FEATURE_2_STORAGE_IMAGE_ATOMIC_BIT; + } + + if (features != 0) { + features |= VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT; + features |= VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT; + features |= VK_FORMAT_FEATURE_2_HOST_IMAGE_TRANSFER_BIT_EXT; + } + + return features; +} + +VkFormatFeatureFlags2 +hk_get_image_format_features(struct hk_physical_device *pdev, + VkFormat vk_format, VkImageTiling tiling) +{ + const struct vk_format_ycbcr_info *ycbcr_info = + vk_format_get_ycbcr_info(vk_format); + if (ycbcr_info == NULL) + return hk_get_image_plane_format_features(pdev, vk_format, tiling); + + /* For multi-plane, we get the feature flags of each plane separately, + * then take their intersection as the overall format feature flags + */ + VkFormatFeatureFlags2 features = ~0ull; + bool cosited_chroma = false; + for (uint8_t plane = 0; plane < ycbcr_info->n_planes; plane++) { + const struct vk_format_ycbcr_plane *plane_info = + &ycbcr_info->planes[plane]; + features &= + hk_get_image_plane_format_features(pdev, plane_info->format, tiling); + if (plane_info->denominator_scales[0] > 1 || + plane_info->denominator_scales[1] > 1) + cosited_chroma = true; + } + if (features == 0) + return 0; + + /* Uh... We really should be able to sample from YCbCr */ + assert(features & VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT); + assert(features & VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT); + + /* These aren't allowed for YCbCr formats */ + features &= + ~(VK_FORMAT_FEATURE_2_BLIT_SRC_BIT | VK_FORMAT_FEATURE_2_BLIT_DST_BIT | + VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT | + VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BLEND_BIT | + VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT); + + /* This is supported on all YCbCr formats */ + features |= + VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_YCBCR_CONVERSION_LINEAR_FILTER_BIT; + + if (ycbcr_info->n_planes > 1) { + /* DISJOINT_BIT implies that each plane has its own separate binding, + * while SEPARATE_RECONSTRUCTION_FILTER_BIT implies that luma and chroma + * each have their own, separate filters, so these two bits make sense + * for multi-planar formats only. + * + * For MIDPOINT_CHROMA_SAMPLES_BIT, NVIDIA HW on single-plane interleaved + * YCbCr defaults to COSITED_EVEN, which is inaccurate and fails tests. + * This can be fixed with a NIR tweak but for now, we only enable this bit + * for multi-plane formats. See Issue #9525 on the mesa/main tracker. 
+ */ + features |= + VK_FORMAT_FEATURE_DISJOINT_BIT | + VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_YCBCR_CONVERSION_SEPARATE_RECONSTRUCTION_FILTER_BIT | + VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT; + } + + if (cosited_chroma) + features |= VK_FORMAT_FEATURE_COSITED_CHROMA_SAMPLES_BIT; + + return features; +} + +static VkFormatFeatureFlags2 +vk_image_usage_to_format_features(VkImageUsageFlagBits usage_flag) +{ + assert(util_bitcount(usage_flag) == 1); + switch (usage_flag) { + case VK_IMAGE_USAGE_TRANSFER_SRC_BIT: + return VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT | + VK_FORMAT_FEATURE_BLIT_SRC_BIT; + case VK_IMAGE_USAGE_TRANSFER_DST_BIT: + return VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT | + VK_FORMAT_FEATURE_BLIT_DST_BIT; + case VK_IMAGE_USAGE_SAMPLED_BIT: + return VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT; + case VK_IMAGE_USAGE_STORAGE_BIT: + return VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT; + case VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT: + return VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT; + case VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT: + return VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT; + default: + return 0; + } +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_GetPhysicalDeviceImageFormatProperties2( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceImageFormatInfo2 *pImageFormatInfo, + VkImageFormatProperties2 *pImageFormatProperties) +{ + VK_FROM_HANDLE(hk_physical_device, pdev, physicalDevice); + + const VkPhysicalDeviceExternalImageFormatInfo *external_info = + vk_find_struct_const(pImageFormatInfo->pNext, + PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO); + + /* Initialize to zero in case we return VK_ERROR_FORMAT_NOT_SUPPORTED */ + memset(&pImageFormatProperties->imageFormatProperties, 0, + sizeof(pImageFormatProperties->imageFormatProperties)); + + const struct vk_format_ycbcr_info *ycbcr_info = + vk_format_get_ycbcr_info(pImageFormatInfo->format); + + /* For the purposes of these checks, we don't care about all the extra + * YCbCr features and we just want the accumulation of features available + * to all planes of the given format. 
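+    * ("Accumulation" here means the intersection: a feature only counts if
+    * every plane supports it.)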
+ */ + VkFormatFeatureFlags2 features; + if (ycbcr_info == NULL) { + features = hk_get_image_plane_format_features( + pdev, pImageFormatInfo->format, pImageFormatInfo->tiling); + } else { + features = ~0ull; + assert(ycbcr_info->n_planes > 0); + for (uint8_t plane = 0; plane < ycbcr_info->n_planes; plane++) { + const VkFormat plane_format = ycbcr_info->planes[plane].format; + features &= hk_get_image_plane_format_features( + pdev, plane_format, pImageFormatInfo->tiling); + } + } + if (features == 0) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + + if (pImageFormatInfo->tiling == VK_IMAGE_TILING_LINEAR && + pImageFormatInfo->type != VK_IMAGE_TYPE_2D) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + + if (ycbcr_info && pImageFormatInfo->type != VK_IMAGE_TYPE_2D) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + + /* From the Vulkan 1.3.279 spec: + * + * VUID-VkImageCreateInfo-tiling-04121 + * + * "If tiling is VK_IMAGE_TILING_LINEAR, flags must not contain + * VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT" + * + * VUID-VkImageCreateInfo-imageType-00970 + * + * "If imageType is VK_IMAGE_TYPE_1D, flags must not contain + * VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT" + */ + if (pImageFormatInfo->flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT && + (pImageFormatInfo->type == VK_IMAGE_TYPE_1D || + pImageFormatInfo->tiling == VK_IMAGE_TILING_LINEAR)) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + + /* From the Vulkan 1.3.279 spec: + * + * VUID-VkImageCreateInfo-flags-09403 + * + * "If flags contains VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT, flags + * must not include VK_IMAGE_CREATE_SPARSE_ALIASED_BIT, + * VK_IMAGE_CREATE_SPARSE_BINDING_BIT, or + * VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT" + */ + if ((pImageFormatInfo->flags & VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT) && + (pImageFormatInfo->flags & (VK_IMAGE_CREATE_SPARSE_ALIASED_BIT | + VK_IMAGE_CREATE_SPARSE_BINDING_BIT | + VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT))) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + + /* We don't yet support sparse, but it shouldn't be too hard */ + if (pImageFormatInfo->flags & (VK_IMAGE_CREATE_SPARSE_ALIASED_BIT | + VK_IMAGE_CREATE_SPARSE_BINDING_BIT | + VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT)) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + + const uint32_t max_dim = 16384; + VkExtent3D maxExtent; + uint32_t maxArraySize; + switch (pImageFormatInfo->type) { + case VK_IMAGE_TYPE_1D: + maxExtent = (VkExtent3D){max_dim, 1, 1}; + maxArraySize = 2048; + break; + case VK_IMAGE_TYPE_2D: + maxExtent = (VkExtent3D){max_dim, max_dim, 1}; + maxArraySize = 2048; + break; + case VK_IMAGE_TYPE_3D: + maxExtent = (VkExtent3D){max_dim, max_dim, max_dim}; + maxArraySize = 1; + break; + default: + unreachable("Invalid image type"); + } + if (pImageFormatInfo->tiling == VK_IMAGE_TILING_LINEAR) + maxArraySize = 1; + + assert(util_is_power_of_two_nonzero(max_dim)); + uint32_t maxMipLevels = util_logbase2(max_dim) + 1; + if (ycbcr_info != NULL || pImageFormatInfo->tiling == VK_IMAGE_TILING_LINEAR) + maxMipLevels = 1; + + VkSampleCountFlags sampleCounts = VK_SAMPLE_COUNT_1_BIT; + if (pImageFormatInfo->tiling == VK_IMAGE_TILING_OPTIMAL && + pImageFormatInfo->type == VK_IMAGE_TYPE_2D && ycbcr_info == NULL && + (features & (VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT | + VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT)) && + !(pImageFormatInfo->flags & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT)) { + + sampleCounts = + VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT; + } + + /* From the Vulkan 1.2.199 spec: + * + * "VK_IMAGE_CREATE_EXTENDED_USAGE_BIT specifies that the image can be + * 
created with usage flags that are not supported for the format the + * image is created with but are supported for at least one format a + * VkImageView created from the image can have." + * + * If VK_IMAGE_CREATE_EXTENDED_USAGE_BIT is set, views can be created with + * different usage than the image so we can't always filter on usage. + * There is one exception to this below for storage. + */ + const VkImageUsageFlags image_usage = pImageFormatInfo->usage; + VkImageUsageFlags view_usage = image_usage; + if (pImageFormatInfo->flags & VK_IMAGE_CREATE_EXTENDED_USAGE_BIT) + view_usage = 0; + + if (view_usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) { + if (!(features & (VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT | + VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT))) { + return VK_ERROR_FORMAT_NOT_SUPPORTED; + } + } + + u_foreach_bit(b, view_usage) { + VkFormatFeatureFlags2 usage_features = + vk_image_usage_to_format_features(1 << b); + if (usage_features && !(features & usage_features)) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + } + + const VkExternalMemoryProperties *ext_mem_props = NULL; + if (external_info != NULL && external_info->handleType != 0) { + bool tiling_has_explicit_layout; + switch (pImageFormatInfo->tiling) { + case VK_IMAGE_TILING_LINEAR: + case VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT: + tiling_has_explicit_layout = true; + break; + case VK_IMAGE_TILING_OPTIMAL: + tiling_has_explicit_layout = false; + break; + default: + unreachable("Unsupported VkImageTiling"); + } + + switch (external_info->handleType) { + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT: + /* No special restrictions */ + if (tiling_has_explicit_layout) { + /* With an explicit memory layout, we don't care which type of + * fd the image belongs too. Both OPAQUE_FD and DMA_BUF are + * interchangeable here. + */ + ext_mem_props = &hk_dma_buf_mem_props; + } else { + ext_mem_props = &hk_opaque_fd_mem_props; + } + break; + + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT: + if (!tiling_has_explicit_layout) { + return vk_errorf(pdev, VK_ERROR_FORMAT_NOT_SUPPORTED, + "VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT " + "requires VK_IMAGE_TILING_LINEAR or " + "VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT"); + } + ext_mem_props = &hk_dma_buf_mem_props; + break; + + default: + /* From the Vulkan 1.3.256 spec: + * + * "If handleType is not compatible with the [parameters] in + * VkPhysicalDeviceImageFormatInfo2, then + * vkGetPhysicalDeviceImageFormatProperties2 returns + * VK_ERROR_FORMAT_NOT_SUPPORTED." + */ + return vk_errorf(pdev, VK_ERROR_FORMAT_NOT_SUPPORTED, + "unsupported VkExternalMemoryTypeFlagBits 0x%x", + external_info->handleType); + } + } + + const unsigned plane_count = + vk_format_get_plane_count(pImageFormatInfo->format); + + /* From the Vulkan 1.3.259 spec, VkImageCreateInfo: + * + * VUID-VkImageCreateInfo-imageCreateFormatFeatures-02260 + * + * "If format is a multi-planar format, and if imageCreateFormatFeatures + * (as defined in Image Creation Limits) does not contain + * VK_FORMAT_FEATURE_DISJOINT_BIT, then flags must not contain + * VK_IMAGE_CREATE_DISJOINT_BIT" + * + * This is satisfied trivially because we support DISJOINT on all + * multi-plane formats. 
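+ * (plane_count above comes from vk_format_get_plane_count(), so VK_FORMAT_D32_SFLOAT_S8_UINT still counts as a single plane for the check below even though hk_image_init stores it as two driver-internal planes.)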
Also, + * + * VUID-VkImageCreateInfo-format-01577 + * + * "If format is not a multi-planar format, and flags does not include + * VK_IMAGE_CREATE_ALIAS_BIT, flags must not contain + * VK_IMAGE_CREATE_DISJOINT_BIT" + */ + if (plane_count == 1 && + !(pImageFormatInfo->flags & VK_IMAGE_CREATE_ALIAS_BIT) && + (pImageFormatInfo->flags & VK_IMAGE_CREATE_DISJOINT_BIT)) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + + if (ycbcr_info && + ((pImageFormatInfo->flags & VK_IMAGE_CREATE_SPARSE_BINDING_BIT) || + (pImageFormatInfo->flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT))) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + + pImageFormatProperties->imageFormatProperties = (VkImageFormatProperties){ + .maxExtent = maxExtent, + .maxMipLevels = maxMipLevels, + .maxArrayLayers = maxArraySize, + .sampleCounts = sampleCounts, + .maxResourceSize = UINT32_MAX, /* TODO */ + }; + + vk_foreach_struct(s, pImageFormatProperties->pNext) { + switch (s->sType) { + case VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES: { + VkExternalImageFormatProperties *p = (void *)s; + /* From the Vulkan 1.3.256 spec: + * + * "If handleType is 0, vkGetPhysicalDeviceImageFormatProperties2 + * will behave as if VkPhysicalDeviceExternalImageFormatInfo was + * not present, and VkExternalImageFormatProperties will be + * ignored." + * + * This is true if and only if ext_mem_props == NULL + */ + if (ext_mem_props != NULL) + p->externalMemoryProperties = *ext_mem_props; + break; + } + case VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_IMAGE_FORMAT_PROPERTIES: { + VkSamplerYcbcrConversionImageFormatProperties *ycbcr_props = (void *)s; + ycbcr_props->combinedImageSamplerDescriptorCount = plane_count; + break; + } + case VK_STRUCTURE_TYPE_HOST_IMAGE_COPY_DEVICE_PERFORMANCE_QUERY_EXT: { + VkHostImageCopyDevicePerformanceQueryEXT *hic_props = (void *)s; + + /* TODO: Check compressability */ + hic_props->optimalDeviceAccess = hic_props->identicalMemoryLayout = + true; + break; + } + default: + vk_debug_ignored_stype(s->sType); + break; + } + } + + return VK_SUCCESS; +} + +static VkSparseImageFormatProperties +hk_fill_sparse_image_fmt_props(VkImageAspectFlags aspects) +{ + /* TODO */ + return (VkSparseImageFormatProperties){ + .aspectMask = aspects, + .flags = VK_SPARSE_IMAGE_FORMAT_SINGLE_MIPTAIL_BIT, + .imageGranularity = + { + .width = 1, + .height = 1, + .depth = 1, + }, + }; +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetPhysicalDeviceSparseImageFormatProperties2( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceSparseImageFormatInfo2 *pFormatInfo, + uint32_t *pPropertyCount, VkSparseImageFormatProperties2 *pProperties) +{ + VkResult result; + + /* Check if the given format info is valid first before returning sparse + * props. 
The easiest way to do this is to just call + * hk_GetPhysicalDeviceImageFormatProperties2() + */ + const VkPhysicalDeviceImageFormatInfo2 img_fmt_info = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2, + .format = pFormatInfo->format, + .type = pFormatInfo->type, + .tiling = pFormatInfo->tiling, + .usage = pFormatInfo->usage, + .flags = VK_IMAGE_CREATE_SPARSE_BINDING_BIT | + VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT, + }; + + VkImageFormatProperties2 img_fmt_props2 = { + .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2, + .pNext = NULL, + }; + + result = hk_GetPhysicalDeviceImageFormatProperties2( + physicalDevice, &img_fmt_info, &img_fmt_props2); + if (result != VK_SUCCESS) { + *pPropertyCount = 0; + return; + } + + const VkImageFormatProperties *props = &img_fmt_props2.imageFormatProperties; + if (!(pFormatInfo->samples & props->sampleCounts)) { + *pPropertyCount = 0; + return; + } + + VK_OUTARRAY_MAKE_TYPED(VkSparseImageFormatProperties2, out, pProperties, + pPropertyCount); + + VkImageAspectFlags aspects = vk_format_aspects(pFormatInfo->format); + + vk_outarray_append_typed(VkSparseImageFormatProperties2, &out, props) + { + props->properties = hk_fill_sparse_image_fmt_props(aspects); + } +} + +static enum ail_tiling +hk_map_tiling(const VkImageCreateInfo *info, unsigned plane) +{ + switch (info->tiling) { + case VK_IMAGE_TILING_LINEAR: + return AIL_TILING_LINEAR; + + case VK_IMAGE_TILING_OPTIMAL: { + const struct vk_format_ycbcr_info *ycbcr_info = + vk_format_get_ycbcr_info(info->format); + VkFormat format = + ycbcr_info ? ycbcr_info->planes[plane].format : info->format; + + if (format == VK_FORMAT_D32_SFLOAT_S8_UINT) { + format = (plane == 0) ? VK_FORMAT_D32_SFLOAT : VK_FORMAT_S8_UINT; + } + + const uint8_t width_scale = + ycbcr_info ? ycbcr_info->planes[plane].denominator_scales[0] : 1; + const uint8_t height_scale = + ycbcr_info ? 
ycbcr_info->planes[plane].denominator_scales[1] : 1; + + if ((info->extent.width / width_scale) < 16 || + (info->extent.height / height_scale) < 16) + return AIL_TILING_TWIDDLED; + + // TODO: lots of bugs to fix first + // return AIL_TILING_TWIDDLED_COMPRESSED; + return AIL_TILING_TWIDDLED; + } + + case VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT: + /* TODO */ + return AIL_TILING_TWIDDLED; + default: + unreachable("invalid tiling"); + } +} + +static uint32_t +modifier_get_score(uint64_t mod) +{ + switch (mod) { + case DRM_FORMAT_MOD_APPLE_TWIDDLED_COMPRESSED: + return 10; + + case DRM_FORMAT_MOD_APPLE_TWIDDLED: + return 5; + + case DRM_FORMAT_MOD_LINEAR: + return 1; + + default: + return 0; + } +} + +static uint64_t +choose_drm_format_mod(uint32_t modifier_count, const uint64_t *modifiers) +{ + uint64_t best_mod = UINT64_MAX; + uint32_t best_score = 0; + + for (uint32_t i = 0; i < modifier_count; ++i) { + uint32_t score = modifier_get_score(modifiers[i]); + if (score > best_score) { + best_mod = modifiers[i]; + best_score = score; + } + } + + if (best_score > 0) + return best_mod; + else + return DRM_FORMAT_MOD_INVALID; +} + +static VkResult +hk_image_init(struct hk_device *dev, struct hk_image *image, + const VkImageCreateInfo *pCreateInfo) +{ + vk_image_init(&dev->vk, &image->vk, pCreateInfo); + + if ((image->vk.usage & (VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | + VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT)) && + image->vk.samples > 1) { + image->vk.usage |= VK_IMAGE_USAGE_SAMPLED_BIT; + image->vk.stencil_usage |= VK_IMAGE_USAGE_SAMPLED_BIT; + } + + if (image->vk.usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT) + image->vk.usage |= VK_IMAGE_USAGE_SAMPLED_BIT; + if (image->vk.usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT) + image->vk.usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; + + image->plane_count = vk_format_get_plane_count(pCreateInfo->format); + image->disjoint = image->plane_count > 1 && + (pCreateInfo->flags & VK_IMAGE_CREATE_DISJOINT_BIT); + + /* We do not support interleaved depth/stencil. Instead, we decompose to + * a depth plane and a stencil plane. + */ + if (image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) { + image->plane_count = 2; + } + + if (image->vk.create_flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT) { + /* Sparse multiplane is not supported. Sparse depth/stencil not supported + * on G13 so we're fine there too. + */ + assert(image->plane_count == 1); + } + + const struct VkImageDrmFormatModifierExplicitCreateInfoEXT + *mod_explicit_info = NULL; + + if (pCreateInfo->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) { + assert(!image->vk.wsi_legacy_scanout); + mod_explicit_info = vk_find_struct_const( + pCreateInfo->pNext, + IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT); + + uint64_t modifier = DRM_FORMAT_MOD_INVALID; + + if (mod_explicit_info) { + modifier = mod_explicit_info->drmFormatModifier; + } else { + const struct VkImageDrmFormatModifierListCreateInfoEXT *mod_list_info = + vk_find_struct_const( + pCreateInfo->pNext, + IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT); + + modifier = choose_drm_format_mod(mod_list_info->drmFormatModifierCount, + mod_list_info->pDrmFormatModifiers); + } + + assert(modifier != DRM_FORMAT_MOD_INVALID); + assert(image->vk.drm_format_mod == DRM_FORMAT_MOD_INVALID); + image->vk.drm_format_mod = modifier; + } + + const struct vk_format_ycbcr_info *ycbcr_info = + vk_format_get_ycbcr_info(pCreateInfo->format); + for (uint8_t plane = 0; plane < image->plane_count; plane++) { + VkFormat format = + ycbcr_info ? 
ycbcr_info->planes[plane].format : pCreateInfo->format; + + if (format == VK_FORMAT_D32_SFLOAT_S8_UINT) { + format = (plane == 0) ? VK_FORMAT_D32_SFLOAT : VK_FORMAT_S8_UINT; + } + + const uint8_t width_scale = + ycbcr_info ? ycbcr_info->planes[plane].denominator_scales[0] : 1; + const uint8_t height_scale = + ycbcr_info ? ycbcr_info->planes[plane].denominator_scales[1] : 1; + + enum ail_tiling tiling = hk_map_tiling(pCreateInfo, plane); + + image->planes[plane].layout = (struct ail_layout){ + .tiling = tiling, + .mipmapped_z = pCreateInfo->imageType == VK_IMAGE_TYPE_3D, + .format = vk_format_to_pipe_format(format), + + .width_px = pCreateInfo->extent.width / width_scale, + .height_px = pCreateInfo->extent.height / height_scale, + .depth_px = MAX2(pCreateInfo->extent.depth, pCreateInfo->arrayLayers), + + .levels = pCreateInfo->mipLevels, + .sample_count_sa = pCreateInfo->samples, + .writeable_image = tiling != AIL_TILING_TWIDDLED_COMPRESSED, + + /* TODO: Maybe optimize this, our GL driver doesn't bother though */ + .renderable = true, + }; + + ail_make_miptree(&image->planes[plane].layout); + } + + return VK_SUCCESS; +} + +static VkResult +hk_image_plane_alloc_vma(struct hk_device *dev, struct hk_image_plane *plane, + VkImageCreateFlags create_flags) +{ + const bool sparse_bound = create_flags & VK_IMAGE_CREATE_SPARSE_BINDING_BIT; + const bool sparse_resident = + create_flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT; + assert(sparse_bound || !sparse_resident); + + if (sparse_bound) { + plane->vma_size_B = plane->layout.size_B; +#if 0 + plane->addr = nouveau_ws_alloc_vma(dev->ws_dev, 0, plane->vma_size_B, + plane->layout.align_B, + false, sparse_resident); +#endif + if (plane->addr == 0) { + return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "Sparse VMA allocation failed"); + } + } + + return VK_SUCCESS; +} + +static void +hk_image_plane_finish(struct hk_device *dev, struct hk_image_plane *plane, + VkImageCreateFlags create_flags, + const VkAllocationCallbacks *pAllocator) +{ + if (plane->vma_size_B) { +#if 0 + const bool sparse_resident = + create_flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT; + + agx_bo_unbind_vma(dev->ws_dev, plane->addr, plane->vma_size_B); + nouveau_ws_free_vma(dev->ws_dev, plane->addr, plane->vma_size_B, + false, sparse_resident); +#endif + } +} + +static void +hk_image_finish(struct hk_device *dev, struct hk_image *image, + const VkAllocationCallbacks *pAllocator) +{ + for (uint8_t plane = 0; plane < image->plane_count; plane++) { + hk_image_plane_finish(dev, &image->planes[plane], image->vk.create_flags, + pAllocator); + } + + vk_image_finish(&image->vk); +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateImage(VkDevice _device, const VkImageCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, VkImage *pImage) +{ + VK_FROM_HANDLE(hk_device, dev, _device); + struct hk_physical_device *pdev = hk_device_physical(dev); + struct hk_image *image; + VkResult result; + +#ifdef HK_USE_WSI_PLATFORM + /* Ignore swapchain creation info on Android. Since we don't have an + * implementation in Mesa, we're guaranteed to access an Android object + * incorrectly. 
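+ * When HK_USE_WSI_PLATFORM is not defined this block compiles out entirely and swapchain-backed images are created like any other image below.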
+ */ + const VkImageSwapchainCreateInfoKHR *swapchain_info = + vk_find_struct_const(pCreateInfo->pNext, IMAGE_SWAPCHAIN_CREATE_INFO_KHR); + if (swapchain_info && swapchain_info->swapchain != VK_NULL_HANDLE) { + return wsi_common_create_swapchain_image( + &pdev->wsi_device, pCreateInfo, swapchain_info->swapchain, pImage); + } +#endif + + image = vk_zalloc2(&dev->vk.alloc, pAllocator, sizeof(*image), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!image) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + result = hk_image_init(dev, image, pCreateInfo); + if (result != VK_SUCCESS) { + vk_free2(&dev->vk.alloc, pAllocator, image); + return result; + } + + for (uint8_t plane = 0; plane < image->plane_count; plane++) { + result = hk_image_plane_alloc_vma(dev, &image->planes[plane], + image->vk.create_flags); + if (result != VK_SUCCESS) { + hk_image_finish(dev, image, pAllocator); + vk_free2(&dev->vk.alloc, pAllocator, image); + return result; + } + } + + *pImage = hk_image_to_handle(image); + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +hk_DestroyImage(VkDevice device, VkImage _image, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_image, image, _image); + + if (!image) + return; + + hk_image_finish(dev, image, pAllocator); + vk_free2(&dev->vk.alloc, pAllocator, image); +} + +static void +hk_image_plane_add_req(struct hk_image_plane *plane, uint64_t *size_B, + uint32_t *align_B) +{ + assert(util_is_power_of_two_or_zero64(*align_B)); + assert(util_is_power_of_two_or_zero64(HK_PLANE_ALIGN_B)); + + *align_B = MAX2(*align_B, HK_PLANE_ALIGN_B); + *size_B = align64(*size_B, HK_PLANE_ALIGN_B); + *size_B += plane->layout.size_B; +} + +static void +hk_get_image_memory_requirements(struct hk_device *dev, struct hk_image *image, + VkImageAspectFlags aspects, + VkMemoryRequirements2 *pMemoryRequirements) +{ + struct hk_physical_device *pdev = hk_device_physical(dev); + uint32_t memory_types = (1 << pdev->mem_type_count) - 1; + + // TODO hope for the best? + + uint64_t size_B = 0; + uint32_t align_B = 0; + if (image->disjoint) { + uint8_t plane = hk_image_aspects_to_plane(image, aspects); + hk_image_plane_add_req(&image->planes[plane], &size_B, &align_B); + } else { + for (unsigned plane = 0; plane < image->plane_count; plane++) + hk_image_plane_add_req(&image->planes[plane], &size_B, &align_B); + } + + pMemoryRequirements->memoryRequirements.memoryTypeBits = memory_types; + pMemoryRequirements->memoryRequirements.alignment = align_B; + pMemoryRequirements->memoryRequirements.size = size_B; + + vk_foreach_struct_const(ext, pMemoryRequirements->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: { + VkMemoryDedicatedRequirements *dedicated = (void *)ext; + dedicated->prefersDedicatedAllocation = false; + dedicated->requiresDedicatedAllocation = false; + break; + } + default: + vk_debug_ignored_stype(ext->sType); + break; + } + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetImageMemoryRequirements2(VkDevice device, + const VkImageMemoryRequirementsInfo2 *pInfo, + VkMemoryRequirements2 *pMemoryRequirements) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_image, image, pInfo->image); + + const VkImagePlaneMemoryRequirementsInfo *plane_info = + vk_find_struct_const(pInfo->pNext, IMAGE_PLANE_MEMORY_REQUIREMENTS_INFO); + const VkImageAspectFlags aspects = + image->disjoint ? 
plane_info->planeAspect : image->vk.aspects; + + hk_get_image_memory_requirements(dev, image, aspects, pMemoryRequirements); +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetDeviceImageMemoryRequirements(VkDevice device, + const VkDeviceImageMemoryRequirements *pInfo, + VkMemoryRequirements2 *pMemoryRequirements) +{ + VK_FROM_HANDLE(hk_device, dev, device); + ASSERTED VkResult result; + struct hk_image image = {0}; + + result = hk_image_init(dev, &image, pInfo->pCreateInfo); + assert(result == VK_SUCCESS); + + const VkImageAspectFlags aspects = + image.disjoint ? pInfo->planeAspect : image.vk.aspects; + + hk_get_image_memory_requirements(dev, &image, aspects, pMemoryRequirements); + + hk_image_finish(dev, &image, NULL); +} + +static VkSparseImageMemoryRequirements +hk_fill_sparse_image_memory_reqs(const struct ail_layout *layout, + VkImageAspectFlags aspects) +{ + VkSparseImageFormatProperties sparse_format_props = + hk_fill_sparse_image_fmt_props(aspects); + + // assert(layout->mip_tail_first_lod <= layout->num_levels); + VkSparseImageMemoryRequirements sparse_memory_reqs = { + .formatProperties = sparse_format_props, + .imageMipTailFirstLod = 0, // layout->mip_tail_first_lod, + .imageMipTailStride = 0, + }; + + sparse_memory_reqs.imageMipTailSize = layout->size_B; + sparse_memory_reqs.imageMipTailOffset = 0; + return sparse_memory_reqs; +} + +static void +hk_get_image_sparse_memory_requirements( + struct hk_device *dev, struct hk_image *image, VkImageAspectFlags aspects, + uint32_t *pSparseMemoryRequirementCount, + VkSparseImageMemoryRequirements2 *pSparseMemoryRequirements) +{ + VK_OUTARRAY_MAKE_TYPED(VkSparseImageMemoryRequirements2, out, + pSparseMemoryRequirements, + pSparseMemoryRequirementCount); + + /* From the Vulkan 1.3.279 spec: + * + * "The sparse image must have been created using the + * VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT flag to retrieve valid sparse + * image memory requirements." + */ + if (!(image->vk.create_flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT)) + return; + + /* We don't support multiplane sparse for now */ + if (image->plane_count > 1) + return; + + vk_outarray_append_typed(VkSparseImageMemoryRequirements2, &out, reqs) + { + reqs->memoryRequirements = + hk_fill_sparse_image_memory_reqs(&image->planes[0].layout, aspects); + }; +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetImageSparseMemoryRequirements2( + VkDevice device, const VkImageSparseMemoryRequirementsInfo2 *pInfo, + uint32_t *pSparseMemoryRequirementCount, + VkSparseImageMemoryRequirements2 *pSparseMemoryRequirements) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_image, image, pInfo->image); + + const VkImageAspectFlags aspects = image->vk.aspects; + + hk_get_image_sparse_memory_requirements(dev, image, aspects, + pSparseMemoryRequirementCount, + pSparseMemoryRequirements); +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetDeviceImageSparseMemoryRequirements( + VkDevice device, const VkDeviceImageMemoryRequirements *pInfo, + uint32_t *pSparseMemoryRequirementCount, + VkSparseImageMemoryRequirements2 *pSparseMemoryRequirements) +{ + VK_FROM_HANDLE(hk_device, dev, device); + ASSERTED VkResult result; + struct hk_image image = {0}; + + result = hk_image_init(dev, &image, pInfo->pCreateInfo); + assert(result == VK_SUCCESS); + + const VkImageAspectFlags aspects = + image.disjoint ? 
pInfo->planeAspect : image.vk.aspects; + + hk_get_image_sparse_memory_requirements(dev, &image, aspects, + pSparseMemoryRequirementCount, + pSparseMemoryRequirements); + + hk_image_finish(dev, &image, NULL); +} + +static void +hk_get_image_subresource_layout(UNUSED struct hk_device *dev, + struct hk_image *image, + const VkImageSubresource2KHR *pSubresource, + VkSubresourceLayout2KHR *pLayout) +{ + const VkImageSubresource *isr = &pSubresource->imageSubresource; + + const uint8_t p = hk_image_aspects_to_plane(image, isr->aspectMask); + const struct hk_image_plane *plane = &image->planes[p]; + + uint64_t offset_B = 0; + if (!image->disjoint) { + uint32_t align_B = 0; + for (unsigned plane = 0; plane < p; plane++) + hk_image_plane_add_req(&image->planes[plane], &offset_B, &align_B); + } + offset_B += + ail_get_layer_level_B(&plane->layout, isr->arrayLayer, isr->mipLevel); + + bool is_3d = image->vk.image_type == VK_IMAGE_TYPE_3D; + + pLayout->subresourceLayout = (VkSubresourceLayout){ + .offset = offset_B, + .size = ail_get_level_size_B(&plane->layout, isr->mipLevel), + + /* From the spec: + * + * It is legal to call vkGetImageSubresourceLayout2KHR with a image + * created with tiling equal to VK_IMAGE_TILING_OPTIMAL, but the + * members of VkSubresourceLayout2KHR::subresourceLayout will have + * undefined values in this case. + * + * So don't collapse with mips. + */ + .rowPitch = isr->mipLevel + ? 0 + : ail_get_wsi_stride_B(&plane->layout, isr->mipLevel), + .arrayPitch = is_3d ? 0 : plane->layout.layer_stride_B, + .depthPitch = is_3d ? plane->layout.layer_stride_B : 0, + }; + + VkSubresourceHostMemcpySizeEXT *memcpy_size = + vk_find_struct(pLayout, SUBRESOURCE_HOST_MEMCPY_SIZE_EXT); + if (memcpy_size) { + memcpy_size->size = pLayout->subresourceLayout.size; + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetImageSubresourceLayout2KHR(VkDevice device, VkImage _image, + const VkImageSubresource2KHR *pSubresource, + VkSubresourceLayout2KHR *pLayout) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_image, image, _image); + + hk_get_image_subresource_layout(dev, image, pSubresource, pLayout); +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetDeviceImageSubresourceLayoutKHR( + VkDevice device, const VkDeviceImageSubresourceInfoKHR *pInfo, + VkSubresourceLayout2KHR *pLayout) +{ + VK_FROM_HANDLE(hk_device, dev, device); + ASSERTED VkResult result; + struct hk_image image = {0}; + + result = hk_image_init(dev, &image, pInfo->pCreateInfo); + assert(result == VK_SUCCESS); + + hk_get_image_subresource_layout(dev, &image, pInfo->pSubresource, pLayout); + + hk_image_finish(dev, &image, NULL); +} + +static void +hk_image_plane_bind(struct hk_device *dev, struct hk_image_plane *plane, + struct hk_device_memory *mem, uint64_t *offset_B) +{ + *offset_B = align64(*offset_B, HK_PLANE_ALIGN_B); + + if (plane->vma_size_B) { +#if 0 + agx_bo_bind_vma(dev->ws_dev, + mem->bo, + plane->addr, + plane->vma_size_B, + *offset_B, + plane->nil.pte_kind); +#endif + unreachable("todo"); + } else { + plane->addr = mem->bo->ptr.gpu + *offset_B; + plane->map = mem->bo->ptr.cpu + *offset_B; + plane->rem = mem->bo->size - (*offset_B); + } + + *offset_B += plane->layout.size_B; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_BindImageMemory2(VkDevice device, uint32_t bindInfoCount, + const VkBindImageMemoryInfo *pBindInfos) +{ + VK_FROM_HANDLE(hk_device, dev, device); + for (uint32_t i = 0; i < bindInfoCount; ++i) { + VK_FROM_HANDLE(hk_device_memory, mem, pBindInfos[i].memory); + VK_FROM_HANDLE(hk_image, image, pBindInfos[i].image); + + 
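/* For disjoint images, each bind covers the single plane named by VkBindImagePlaneMemoryInfo; otherwise the planes are packed back-to-back from memoryOffset, each aligned to HK_PLANE_ALIGN_B by hk_image_plane_bind(). Swapchain-backed images instead alias the WSI image's address. */ +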
/* Ignore this struct on Android, we cannot access swapchain structures + * there. */ +#ifdef HK_USE_WSI_PLATFORM + const VkBindImageMemorySwapchainInfoKHR *swapchain_info = + vk_find_struct_const(pBindInfos[i].pNext, + BIND_IMAGE_MEMORY_SWAPCHAIN_INFO_KHR); + + if (swapchain_info && swapchain_info->swapchain != VK_NULL_HANDLE) { + VkImage _wsi_image = wsi_common_get_image(swapchain_info->swapchain, + swapchain_info->imageIndex); + VK_FROM_HANDLE(hk_image, wsi_img, _wsi_image); + + assert(image->plane_count == 1); + assert(wsi_img->plane_count == 1); + + struct hk_image_plane *plane = &image->planes[0]; + struct hk_image_plane *swapchain_plane = &wsi_img->planes[0]; + + /* Copy memory binding information from swapchain image to the current + * image's plane. */ + plane->addr = swapchain_plane->addr; + continue; + } +#endif + + uint64_t offset_B = pBindInfos[i].memoryOffset; + if (image->disjoint) { + const VkBindImagePlaneMemoryInfo *plane_info = vk_find_struct_const( + pBindInfos[i].pNext, BIND_IMAGE_PLANE_MEMORY_INFO); + uint8_t plane = + hk_image_aspects_to_plane(image, plane_info->planeAspect); + hk_image_plane_bind(dev, &image->planes[plane], mem, &offset_B); + } else { + for (unsigned plane = 0; plane < image->plane_count; plane++) { + hk_image_plane_bind(dev, &image->planes[plane], mem, &offset_B); + } + } + + const VkBindMemoryStatusKHR *status = + vk_find_struct_const(pBindInfos[i].pNext, BIND_MEMORY_STATUS_KHR); + if (status != NULL && status->pResult != NULL) + *status->pResult = VK_SUCCESS; + } + + return VK_SUCCESS; +} + +static uint32_t +hk_plane_index(VkFormat format, VkImageAspectFlags aspect_mask) +{ + switch (aspect_mask) { + default: + assert(aspect_mask != VK_IMAGE_ASPECT_MEMORY_PLANE_3_BIT_EXT); + return 0; + case VK_IMAGE_ASPECT_PLANE_1_BIT: + case VK_IMAGE_ASPECT_MEMORY_PLANE_1_BIT_EXT: + return 1; + case VK_IMAGE_ASPECT_PLANE_2_BIT: + case VK_IMAGE_ASPECT_MEMORY_PLANE_2_BIT_EXT: + return 2; + case VK_IMAGE_ASPECT_STENCIL_BIT: + return format == VK_FORMAT_D32_SFLOAT_S8_UINT; + } +} + +static void +hk_copy_memory_to_image(struct hk_device *device, struct hk_image *dst_image, + const VkMemoryToImageCopyEXT *info, bool copy_memcpy) +{ + unsigned plane = + hk_plane_index(dst_image->vk.format, info->imageSubresource.aspectMask); + const struct ail_layout *layout = &dst_image->planes[plane].layout; + + VkOffset3D offset = info->imageOffset; + VkExtent3D extent = info->imageExtent; + uint32_t src_width = info->memoryRowLength ?: extent.width; + uint32_t src_height = info->memoryImageHeight ?: extent.height; + + uint32_t blocksize_B = util_format_get_blocksize(layout->format); + uint32_t src_pitch = src_width * blocksize_B; + + unsigned start_layer = (dst_image->vk.image_type == VK_IMAGE_TYPE_3D) + ? offset.z + : info->imageSubresource.baseArrayLayer; + uint32_t layers = + MAX2(extent.depth, vk_image_subresource_layer_count( + &dst_image->vk, &info->imageSubresource)); + + unsigned level = info->imageSubresource.mipLevel; + uint32_t image_offset = ail_get_layer_level_B(layout, start_layer, level); + uint32_t dst_layer_stride = layout->layer_stride_B; + uint32_t src_layer_stride = copy_memcpy + ? 
ail_get_level_size_B(layout, level) + : (src_width * src_height * blocksize_B); + bool tiled = ail_is_level_twiddled_uncompressed( + layout, info->imageSubresource.mipLevel); + + const char *src = + (const char *)info->pHostPointer + start_layer * dst_layer_stride; + char *dst = (char *)dst_image->planes[plane].map + image_offset; + for (unsigned layer = 0; layer < layers; + layer++, src += src_layer_stride, dst += dst_layer_stride) { + if (copy_memcpy) { + memcpy(dst, src, ail_get_level_size_B(layout, level)); + } else if (!tiled) { + uint32_t dst_pitch = ail_get_linear_stride_B(layout, level); + /*TODO:comp*/ + for (unsigned y = 0; y < extent.height; y++) { + memcpy(dst + dst_pitch * (y + offset.y) + offset.x * blocksize_B, + src + src_pitch * y, extent.width * blocksize_B); + } + } else { + ail_tile(dst, (void *)src, layout, level, src_pitch, offset.x, + offset.y, extent.width, extent.height); + } + } +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CopyMemoryToImageEXT(VkDevice _device, + const VkCopyMemoryToImageInfoEXT *info) +{ + VK_FROM_HANDLE(hk_device, device, _device); + VK_FROM_HANDLE(hk_image, dst_image, info->dstImage); + + for (unsigned i = 0; i < info->regionCount; i++) { + hk_copy_memory_to_image(device, dst_image, &info->pRegions[i], + info->flags & VK_HOST_IMAGE_COPY_MEMCPY_EXT); + } + + return VK_SUCCESS; +} + +static void +hk_copy_image_to_memory(struct hk_device *device, struct hk_image *src_image, + const VkImageToMemoryCopyEXT *info, bool copy_memcpy) +{ + unsigned plane = + hk_plane_index(src_image->vk.format, info->imageSubresource.aspectMask); + const struct ail_layout *layout = &src_image->planes[plane].layout; + + VkOffset3D offset = info->imageOffset; + VkExtent3D extent = info->imageExtent; + uint32_t dst_width = info->memoryRowLength ?: extent.width; + uint32_t dst_height = info->memoryImageHeight ?: extent.height; + +#if 0 + copy_compressed(src_image->vk.format, &offset, &extent, &dst_width, + &dst_height); +#endif + + uint32_t blocksize_B = util_format_get_blocksize(layout->format); + uint32_t dst_pitch = dst_width * blocksize_B; + + unsigned start_layer = (src_image->vk.image_type == VK_IMAGE_TYPE_3D) + ? offset.z + : info->imageSubresource.baseArrayLayer; + uint32_t layers = + MAX2(extent.depth, vk_image_subresource_layer_count( + &src_image->vk, &info->imageSubresource)); + unsigned level = info->imageSubresource.mipLevel; + + uint32_t image_offset = ail_get_layer_level_B(layout, start_layer, level); + uint32_t src_layer_stride = layout->layer_stride_B; + uint32_t dst_layer_stride = copy_memcpy + ? 
ail_get_level_size_B(layout, level) + : (dst_width * dst_height * blocksize_B); + + bool tiled = ail_is_level_twiddled_uncompressed( + layout, info->imageSubresource.mipLevel); + + const char *src = (const char *)src_image->planes[plane].map + image_offset; + char *dst = (char *)info->pHostPointer + start_layer * dst_layer_stride; + for (unsigned layer = 0; layer < layers; + layer++, src += src_layer_stride, dst += dst_layer_stride) { + + if (copy_memcpy) { + memcpy(dst, src, dst_layer_stride); + } else if (!tiled) { + /* TODO: comp */ + uint32_t src_pitch = ail_get_linear_stride_B(layout, level); + for (unsigned y = 0; y < extent.height; y++) { + memcpy(dst + dst_pitch * y, + src + src_pitch * (y + offset.y) + offset.x * blocksize_B, + extent.width * blocksize_B); + } + } else { + ail_detile((void *)src, dst, layout, info->imageSubresource.mipLevel, + dst_pitch, offset.x, offset.y, extent.width, extent.height); + } + } +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CopyImageToMemoryEXT(VkDevice _device, + const VkCopyImageToMemoryInfoEXT *info) +{ + VK_FROM_HANDLE(hk_device, device, _device); + VK_FROM_HANDLE(hk_image, image, info->srcImage); + + for (unsigned i = 0; i < info->regionCount; i++) { + hk_copy_image_to_memory(device, image, &info->pRegions[i], + info->flags & VK_HOST_IMAGE_COPY_MEMCPY_EXT); + } + + return VK_SUCCESS; +} + +static void +hk_copy_image_to_image_cpu(struct hk_device *device, struct hk_image *src_image, + struct hk_image *dst_image, const VkImageCopy2 *info, + bool copy_memcpy) +{ + unsigned src_plane = + hk_plane_index(src_image->vk.format, info->srcSubresource.aspectMask); + unsigned dst_plane = + hk_plane_index(dst_image->vk.format, info->dstSubresource.aspectMask); + + const struct ail_layout *src_layout = &src_image->planes[src_plane].layout; + const struct ail_layout *dst_layout = &dst_image->planes[dst_plane].layout; + + VkOffset3D src_offset = info->srcOffset; + VkOffset3D dst_offset = info->dstOffset; + VkExtent3D extent = info->extent; + uint32_t layers_to_copy = MAX2( + info->extent.depth, + vk_image_subresource_layer_count(&src_image->vk, &info->srcSubresource)); + + /* See comment above. */ +#if 0 + copy_compressed(src_image->vk.format, &src_offset, &extent, NULL, NULL); + copy_compressed(dst_image->vk.format, &dst_offset, NULL, NULL, NULL); +#endif + + unsigned src_start_layer = (src_image->vk.image_type == VK_IMAGE_TYPE_3D) + ? src_offset.z + : info->srcSubresource.baseArrayLayer; + unsigned dst_start_layer = (dst_image->vk.image_type == VK_IMAGE_TYPE_3D) + ? 
dst_offset.z + : info->dstSubresource.baseArrayLayer; + + uint32_t src_layer_stride = src_layout->layer_stride_B; + uint32_t dst_layer_stride = dst_layout->layer_stride_B; + + uint32_t dst_block_B = util_format_get_blocksize(dst_layout->format); + uint32_t src_block_B = util_format_get_blocksize(src_layout->format); + + uint32_t src_image_offset = ail_get_layer_level_B( + src_layout, src_start_layer, info->srcSubresource.mipLevel); + uint32_t dst_image_offset = ail_get_layer_level_B( + dst_layout, dst_start_layer, info->dstSubresource.mipLevel); + + bool src_tiled = ail_is_level_twiddled_uncompressed( + src_layout, info->srcSubresource.mipLevel); + bool dst_tiled = ail_is_level_twiddled_uncompressed( + dst_layout, info->dstSubresource.mipLevel); + + const char *src = + (const char *)src_image->planes[src_plane].map + src_image_offset; + char *dst = (char *)dst_image->planes[dst_plane].map + dst_image_offset; + for (unsigned layer = 0; layer < layers_to_copy; + layer++, src += src_layer_stride, dst += dst_layer_stride) { + + if (copy_memcpy) { + uint32_t src_size = + ail_get_level_size_B(src_layout, info->srcSubresource.mipLevel); + uint32_t dst_size = + ail_get_level_size_B(dst_layout, info->dstSubresource.mipLevel); + + assert(src_size == dst_size); + memcpy(dst, src, src_size); + } else if (!src_tiled && !dst_tiled) { + /* TODO comp */ + uint32_t src_pitch = + ail_get_linear_stride_B(src_layout, info->srcSubresource.mipLevel); + + uint32_t dst_pitch = + ail_get_linear_stride_B(dst_layout, info->dstSubresource.mipLevel); + + for (unsigned y = 0; y < extent.height; y++) { + memcpy(dst + dst_pitch * (y + dst_offset.y) + + dst_offset.x * dst_block_B, + src + src_pitch * (y + src_offset.y) + + src_offset.x * src_block_B, + extent.width * src_block_B); + } + } else if (!src_tiled) { + unreachable("todo"); +#if 0 + fdl6_memcpy_linear_to_tiled( + dst_offset.x, dst_offset.y, extent.width, extent.height, dst, + src + src_pitch * src_offset.y + src_offset.x * src_layout->cpp, + dst_layout, info->dstSubresource.mipLevel, src_pitch, + &device->physical_device->ubwc_config); +#endif + } else if (!dst_tiled) { + unreachable("todo"); +#if 0 + fdl6_memcpy_tiled_to_linear( + src_offset.x, src_offset.y, extent.width, extent.height, + dst + dst_pitch * dst_offset.y + dst_offset.x * dst_layout->cpp, + src, src_layout, info->dstSubresource.mipLevel, dst_pitch, + &device->physical_device->ubwc_config); +#endif + } else { + /* Work tile-by-tile, holding the unswizzled tile in a temporary + * buffer. 
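+ * Each source tile that overlaps the copy rectangle is detiled into temp_tile and then retiled into the destination, with the width and height clipped to the region actually being copied.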
+ */ + char temp_tile[16384]; + + unsigned src_level = info->srcSubresource.mipLevel; + unsigned dst_level = info->dstSubresource.mipLevel; + uint32_t block_width = src_layout->tilesize_el[src_level].width_el; + uint32_t block_height = src_layout->tilesize_el[src_level].height_el; + uint32_t temp_pitch = block_width * src_block_B; + ; + + for (unsigned by = src_offset.y / block_height; + by * block_height < src_offset.y + extent.height; by++) { + uint32_t src_y_start = MAX2(src_offset.y, by * block_height); + uint32_t dst_y_start = src_y_start - src_offset.y + dst_offset.y; + uint32_t height = + MIN2((by + 1) * block_height, src_offset.y + extent.height) - + src_y_start; + for (unsigned bx = src_offset.x / block_width; + bx * block_width < src_offset.x + extent.width; bx++) { + uint32_t src_x_start = MAX2(src_offset.x, bx * block_width); + uint32_t dst_x_start = src_x_start - src_offset.x + dst_offset.x; + uint32_t width = + MIN2((bx + 1) * block_width, src_offset.x + extent.width) - + src_x_start; + + ail_detile((void *)src, temp_tile, src_layout, src_level, + temp_pitch, src_x_start, src_y_start, width, height); + ail_tile(dst, temp_tile, dst_layout, dst_level, temp_pitch, + dst_x_start, dst_y_start, width, height); + } + } + } + } +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CopyImageToImageEXT(VkDevice _device, + const VkCopyImageToImageInfoEXT *pCopyImageToImageInfo) +{ + VK_FROM_HANDLE(hk_device, device, _device); + VK_FROM_HANDLE(hk_image, src_image, pCopyImageToImageInfo->srcImage); + VK_FROM_HANDLE(hk_image, dst_image, pCopyImageToImageInfo->dstImage); + bool copy_memcpy = + pCopyImageToImageInfo->flags & VK_HOST_IMAGE_COPY_MEMCPY_EXT; + + for (uint32_t i = 0; i < pCopyImageToImageInfo->regionCount; ++i) { + if (src_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) { + VkImageCopy2 info = pCopyImageToImageInfo->pRegions[i]; + u_foreach_bit(b, info.dstSubresource.aspectMask) { + info.srcSubresource.aspectMask = BITFIELD_BIT(b); + info.dstSubresource.aspectMask = BITFIELD_BIT(b); + hk_copy_image_to_image_cpu(device, src_image, dst_image, &info, + copy_memcpy); + } + continue; + } + + hk_copy_image_to_image_cpu(device, src_image, dst_image, + pCopyImageToImageInfo->pRegions + i, + copy_memcpy); + } + + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_TransitionImageLayoutEXT( + VkDevice device, uint32_t transitionCount, + const VkHostImageLayoutTransitionInfoEXT *transitions) +{ + /* We don't do anything with layouts so this should be a no-op */ + return VK_SUCCESS; +} diff --git a/src/asahi/vulkan/hk_image.h b/src/asahi/vulkan/hk_image.h new file mode 100644 index 00000000000..a15129032aa --- /dev/null +++ b/src/asahi/vulkan/hk_image.h @@ -0,0 +1,115 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "asahi/layout/layout.h" +#include "vulkan/vulkan_core.h" + +#include "hk_private.h" + +#include "vk_image.h" + +/* Because small images can end up with an array_stride_B that is less than + * the sparse block size (in bytes), we have to set SINGLE_MIPTAIL_BIT when + * advertising sparse properties to the client. This means that we get one + * single memory range for the miptail of the image. For large images with + * mipTailStartLod > 0, we have to deal with the array stride ourselves. + * + * We do this by returning HK_MIP_TAIL_START_OFFSET as the image's + * imageMipTailOffset. 
We can then detect anything with that address as + * being part of the miptail and re-map it accordingly. The Vulkan spec + * explicitly allows for this. + * + * From the Vulkan 1.3.279 spec: + * + * "When VK_SPARSE_MEMORY_BIND_METADATA_BIT is present, the resourceOffset + * must have been derived explicitly from the imageMipTailOffset in the + * sparse resource properties returned for the metadata aspect. By + * manipulating the value returned for imageMipTailOffset, the + * resourceOffset does not have to correlate directly to a device virtual + * address offset, and may instead be whatever value makes it easiest for + * the implementation to derive the correct device virtual address." + */ +#define HK_MIP_TAIL_START_OFFSET 0x6d74000000000000UL + +struct hk_device_memory; +struct hk_physical_device; + +static VkFormatFeatureFlags2 +hk_get_image_plane_format_features(struct hk_physical_device *pdev, + VkFormat vk_format, VkImageTiling tiling); + +VkFormatFeatureFlags2 +hk_get_image_format_features(struct hk_physical_device *pdevice, + VkFormat format, VkImageTiling tiling); + +struct hk_image_plane { + struct ail_layout layout; + uint64_t addr; + + /** Size of the reserved VMA range for sparse images, zero otherwise. */ + uint64_t vma_size_B; + + /* For host image copy */ + void *map; + uint32_t rem; +}; + +struct hk_image { + struct vk_image vk; + + /** True if the planes are bound separately + * + * This is set based on VK_IMAGE_CREATE_DISJOINT_BIT + */ + bool disjoint; + + uint8_t plane_count; + struct hk_image_plane planes[3]; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_image, vk.base, VkImage, VK_OBJECT_TYPE_IMAGE) + +static inline uint64_t +hk_image_plane_base_address(const struct hk_image_plane *plane) +{ + return plane->addr; +} + +static inline uint64_t +hk_image_base_address(const struct hk_image *image, uint8_t plane) +{ + return hk_image_plane_base_address(&image->planes[plane]); +} + +static inline uint8_t +hk_image_aspects_to_plane(const struct hk_image *image, + VkImageAspectFlags aspectMask) +{ + /* Must only be one aspect unless it's depth/stencil */ + assert(aspectMask == + (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT) || + util_bitcount(aspectMask) == 1); + + switch (aspectMask) { + default: + assert(aspectMask != VK_IMAGE_ASPECT_MEMORY_PLANE_3_BIT_EXT); + return 0; + + case VK_IMAGE_ASPECT_STENCIL_BIT: + return image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT; + + case VK_IMAGE_ASPECT_PLANE_1_BIT: + case VK_IMAGE_ASPECT_MEMORY_PLANE_1_BIT_EXT: + return 1; + + case VK_IMAGE_ASPECT_PLANE_2_BIT: + case VK_IMAGE_ASPECT_MEMORY_PLANE_2_BIT_EXT: + return 2; + } +} diff --git a/src/asahi/vulkan/hk_image_view.c b/src/asahi/vulkan/hk_image_view.c new file mode 100644 index 00000000000..5a78224a4fd --- /dev/null +++ b/src/asahi/vulkan/hk_image_view.c @@ -0,0 +1,653 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ +#include "hk_image_view.h" +#include "util/format/u_format.h" +#include "vulkan/vulkan_core.h" + +#include "agx_helpers.h" +#include "agx_nir_passes.h" +#include "agx_pack.h" +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_image.h" +#include "hk_physical_device.h" + +#include "layout.h" +#include "vk_format.h" + +enum hk_desc_usage { + HK_DESC_USAGE_SAMPLED, + HK_DESC_USAGE_STORAGE, + HK_DESC_USAGE_INPUT, + HK_DESC_USAGE_BG_EOT, + HK_DESC_USAGE_LAYERED_BG_EOT, + HK_DESC_USAGE_EMRT, +}; + +static bool +hk_image_view_type_is_array(VkImageViewType view_type) +{ + switch (view_type) { + case VK_IMAGE_VIEW_TYPE_1D: + case VK_IMAGE_VIEW_TYPE_2D: + case VK_IMAGE_VIEW_TYPE_3D: + case VK_IMAGE_VIEW_TYPE_CUBE: + return false; + + case VK_IMAGE_VIEW_TYPE_1D_ARRAY: + case VK_IMAGE_VIEW_TYPE_2D_ARRAY: + case VK_IMAGE_VIEW_TYPE_CUBE_ARRAY: + return true; + + default: + unreachable("Invalid image view type"); + } +} + +static enum agx_texture_dimension +translate_image_view_type(VkImageViewType view_type, bool msaa, bool layered, + enum hk_desc_usage usage) +{ + if (usage == HK_DESC_USAGE_EMRT || usage == HK_DESC_USAGE_INPUT || + (usage == HK_DESC_USAGE_LAYERED_BG_EOT && layered)) { + return msaa ? AGX_TEXTURE_DIMENSION_2D_ARRAY_MULTISAMPLED + : AGX_TEXTURE_DIMENSION_2D_ARRAY; + } + + /* For background/EOT, we ignore the application-provided view type */ + if (usage == HK_DESC_USAGE_BG_EOT || usage == HK_DESC_USAGE_LAYERED_BG_EOT) { + return msaa ? AGX_TEXTURE_DIMENSION_2D_MULTISAMPLED + : AGX_TEXTURE_DIMENSION_2D; + } + + bool cubes_to_2d = usage != HK_DESC_USAGE_SAMPLED; + + switch (view_type) { + case VK_IMAGE_VIEW_TYPE_1D: + case VK_IMAGE_VIEW_TYPE_2D: + return msaa ? AGX_TEXTURE_DIMENSION_2D_MULTISAMPLED + : AGX_TEXTURE_DIMENSION_2D; + + case VK_IMAGE_VIEW_TYPE_1D_ARRAY: + case VK_IMAGE_VIEW_TYPE_2D_ARRAY: + return msaa ? AGX_TEXTURE_DIMENSION_2D_ARRAY_MULTISAMPLED + : AGX_TEXTURE_DIMENSION_2D_ARRAY; + + case VK_IMAGE_VIEW_TYPE_3D: + assert(!msaa); + return AGX_TEXTURE_DIMENSION_3D; + + case VK_IMAGE_VIEW_TYPE_CUBE: + assert(!msaa); + return cubes_to_2d ? AGX_TEXTURE_DIMENSION_2D_ARRAY + : AGX_TEXTURE_DIMENSION_CUBE; + + case VK_IMAGE_VIEW_TYPE_CUBE_ARRAY: + assert(!msaa); + return cubes_to_2d ? 
AGX_TEXTURE_DIMENSION_2D_ARRAY + : AGX_TEXTURE_DIMENSION_CUBE_ARRAY; + + default: + unreachable("Invalid image view type"); + } +} + +static enum pipe_swizzle +vk_swizzle_to_pipe(VkComponentSwizzle swizzle) +{ + switch (swizzle) { + case VK_COMPONENT_SWIZZLE_R: + return PIPE_SWIZZLE_X; + case VK_COMPONENT_SWIZZLE_G: + return PIPE_SWIZZLE_Y; + case VK_COMPONENT_SWIZZLE_B: + return PIPE_SWIZZLE_Z; + case VK_COMPONENT_SWIZZLE_A: + return PIPE_SWIZZLE_W; + case VK_COMPONENT_SWIZZLE_ONE: + return PIPE_SWIZZLE_1; + case VK_COMPONENT_SWIZZLE_ZERO: + return PIPE_SWIZZLE_0; + default: + unreachable("Invalid component swizzle"); + } +} + +static enum pipe_format +get_stencil_format(enum pipe_format format) +{ + switch (format) { + case PIPE_FORMAT_S8_UINT: + return PIPE_FORMAT_S8_UINT; + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + return PIPE_FORMAT_X24S8_UINT; + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + return PIPE_FORMAT_S8X24_UINT; + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + return PIPE_FORMAT_X32_S8X24_UINT; + default: + unreachable("Unsupported depth/stencil format"); + } +} + +struct hk_3d { + unsigned x, y, z; +}; + +static struct hk_3d +view_denominator(struct hk_image_view *view) +{ + enum pipe_format view_format = vk_format_to_pipe_format(view->vk.format); + enum pipe_format img_format = + vk_format_to_pipe_format(view->vk.image->format); + + if (util_format_is_compressed(view_format)) { + /* + * We can do an uncompressed view of a compressed image but not the other + * way around. + */ + assert(util_format_is_compressed(img_format)); + assert(util_format_get_blockwidth(img_format) == + util_format_get_blockwidth(view_format)); + assert(util_format_get_blockheight(img_format) == + util_format_get_blockheight(view_format)); + assert(util_format_get_blockdepth(img_format) == + util_format_get_blockdepth(view_format)); + + return (struct hk_3d){1, 1, 1}; + } + + if (!util_format_is_compressed(img_format)) { + /* Both formats uncompressed */ + return (struct hk_3d){1, 1, 1}; + } + + /* Else, img is compressed but view is not */ + return (struct hk_3d){ + util_format_get_blockwidth(img_format), + util_format_get_blockheight(img_format), + util_format_get_blockdepth(img_format), + }; +} + +static enum pipe_format +format_for_plane(struct hk_image_view *view, unsigned view_plane) +{ + const struct vk_format_ycbcr_info *ycbcr_info = + vk_format_get_ycbcr_info(view->vk.format); + + assert(ycbcr_info || view_plane == 0); + VkFormat plane_format = + ycbcr_info ? 
ycbcr_info->planes[view_plane].format : view->vk.format; + + enum pipe_format p_format = vk_format_to_pipe_format(plane_format); + if (view->vk.aspects == VK_IMAGE_ASPECT_STENCIL_BIT) + p_format = get_stencil_format(p_format); + + return p_format; +} + +static void +pack_texture(struct hk_image_view *view, unsigned view_plane, + enum hk_desc_usage usage, struct agx_texture_packed *out) +{ + struct hk_image *image = container_of(view->vk.image, struct hk_image, vk); + const uint8_t image_plane = view->planes[view_plane].image_plane; + struct ail_layout *layout = &image->planes[image_plane].layout; + uint64_t base_addr = hk_image_base_address(image, image_plane); + + bool cubes_to_2d = usage != HK_DESC_USAGE_SAMPLED; + + unsigned level = view->vk.base_mip_level; + unsigned layer = view->vk.base_array_layer; + + enum pipe_format p_format = format_for_plane(view, view_plane); + const struct util_format_description *desc = + util_format_description(p_format); + + struct hk_3d denom = view_denominator(view); + + uint8_t format_swizzle[4] = { + desc->swizzle[0], + desc->swizzle[1], + desc->swizzle[2], + desc->swizzle[3], + }; + + /* Different APIs have different depth/stencil swizzle rules. Vulkan expects + * R001 behaviour, override here because Mesa's format table is not that. + */ + if (util_format_is_depth_or_stencil(p_format)) { + format_swizzle[0] = PIPE_SWIZZLE_X; + format_swizzle[1] = PIPE_SWIZZLE_0; + format_swizzle[2] = PIPE_SWIZZLE_0; + format_swizzle[3] = PIPE_SWIZZLE_1; + } + + /* We only have a single swizzle for the user swizzle and the format + * fixup, so compose them now. + */ + uint8_t out_swizzle[4]; + uint8_t view_swizzle[4] = { + vk_swizzle_to_pipe(view->vk.swizzle.r), + vk_swizzle_to_pipe(view->vk.swizzle.g), + vk_swizzle_to_pipe(view->vk.swizzle.b), + vk_swizzle_to_pipe(view->vk.swizzle.a), + }; + + unsigned layers = view->vk.layer_count; + if (view->vk.view_type == VK_IMAGE_VIEW_TYPE_3D) { + layers = DIV_ROUND_UP(layout->depth_px, denom.z); + } else if (!cubes_to_2d && + (view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE || + view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)) { + + layers /= 6; + } + + util_format_compose_swizzles(format_swizzle, view_swizzle, out_swizzle); + + agx_pack(out, TEXTURE, cfg) { + cfg.dimension = translate_image_view_type( + view->vk.view_type, view->vk.image->samples > 1, layers > 1, usage); + cfg.layout = agx_translate_layout(layout->tiling); + cfg.channels = agx_pixel_format[p_format].channels; + cfg.type = agx_pixel_format[p_format].type; + cfg.srgb = util_format_is_srgb(p_format); + + cfg.swizzle_r = agx_channel_from_pipe(out_swizzle[0]); + cfg.swizzle_g = agx_channel_from_pipe(out_swizzle[1]); + cfg.swizzle_b = agx_channel_from_pipe(out_swizzle[2]); + cfg.swizzle_a = agx_channel_from_pipe(out_swizzle[3]); + + if (denom.x > 1) { + assert(view->vk.level_count == 1); + assert(view->vk.layer_count == 1); + + cfg.address = base_addr + ail_get_layer_level_B(layout, layer, level); + cfg.width = DIV_ROUND_UP(u_minify(layout->width_px, level), denom.x); + cfg.height = DIV_ROUND_UP(u_minify(layout->height_px, level), denom.y); + cfg.first_level = 0; + cfg.last_level = 1; + } else { + cfg.address = base_addr + ail_get_layer_offset_B(layout, layer); + cfg.width = layout->width_px; + cfg.height = layout->height_px; + cfg.first_level = level; + cfg.last_level = level + view->vk.level_count - 1; + } + + cfg.srgb = (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB); + cfg.unk_mipmapped = layout->levels > 1; + cfg.srgb_2_channel = cfg.srgb && 
util_format_colormask(desc) == 0x3; + + if (ail_is_compressed(layout)) { + cfg.compressed_1 = true; + cfg.extended = true; + } + + if (ail_is_compressed(layout)) { + cfg.acceleration_buffer = base_addr + layout->metadata_offset_B + + (layer * layout->compression_layer_stride_B); + } + + if (layout->tiling == AIL_TILING_LINEAR && + (hk_image_view_type_is_array(view->vk.view_type))) { + + cfg.depth_linear = layers; + cfg.layer_stride_linear = layout->layer_stride_B - 0x80; + cfg.extended = true; + } else { + assert((layout->tiling != AIL_TILING_LINEAR) || (layers == 1)); + cfg.depth = layers; + } + + if (view->vk.image->samples > 1) { + cfg.samples = agx_translate_sample_count(view->vk.image->samples); + } + + if (layout->tiling == AIL_TILING_LINEAR) { + cfg.stride = ail_get_linear_stride_B(layout, 0) - 16; + } else { + assert(layout->tiling == AIL_TILING_TWIDDLED || + layout->tiling == AIL_TILING_TWIDDLED_COMPRESSED); + + cfg.page_aligned_layers = layout->page_aligned_layers; + } + } +} + +static void +pack_pbe(struct hk_device *dev, struct hk_image_view *view, unsigned view_plane, + enum hk_desc_usage usage, struct agx_pbe_packed *out) +{ + struct hk_image *image = container_of(view->vk.image, struct hk_image, vk); + const uint8_t image_plane = view->planes[view_plane].image_plane; + struct ail_layout *layout = &image->planes[image_plane].layout; + uint64_t base_addr = hk_image_base_address(image, image_plane); + + unsigned level = view->vk.base_mip_level; + unsigned layer = view->vk.base_array_layer; + + enum pipe_format p_format = format_for_plane(view, view_plane); + const struct util_format_description *desc = + util_format_description(p_format); + + bool eot = + usage == HK_DESC_USAGE_BG_EOT || usage == HK_DESC_USAGE_LAYERED_BG_EOT; + + /* The tilebuffer is already in sRGB space if needed. Do not convert for + * end-of-tile descriptors. + */ + if (eot) + p_format = util_format_linear(p_format); + + bool msaa = view->vk.image->samples > 1; + struct hk_3d denom = view_denominator(view); + + unsigned layers = view->vk.view_type == VK_IMAGE_VIEW_TYPE_3D + ? image->vk.extent.depth + : view->vk.layer_count; + + agx_pack(out, PBE, cfg) { + cfg.dimension = + translate_image_view_type(view->vk.view_type, msaa, layers > 1, usage); + cfg.layout = agx_translate_layout(layout->tiling); + cfg.channels = agx_pixel_format[p_format].channels; + cfg.type = agx_pixel_format[p_format].type; + cfg.srgb = util_format_is_srgb(p_format); + + assert(desc->nr_channels >= 1 && desc->nr_channels <= 4); + + for (unsigned i = 0; i < desc->nr_channels; ++i) { + if (desc->swizzle[i] == 0) + cfg.swizzle_r = i; + else if (desc->swizzle[i] == 1) + cfg.swizzle_g = i; + else if (desc->swizzle[i] == 2) + cfg.swizzle_b = i; + else if (desc->swizzle[i] == 3) + cfg.swizzle_a = i; + } + + cfg.buffer = base_addr + ail_get_layer_offset_B(layout, layer); + cfg.unk_mipmapped = layout->levels > 1; + + if (msaa & !eot) { + /* Multisampled images are bound like buffer textures, with + * addressing arithmetic to determine the texel to write. + * + * Note that the end-of-tile program uses real multisample images + * with image_write_block instructions. 
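+ * The descriptor below is therefore a linear 2D view over the layer's raw texels: the width is fixed at AGX_TEXTURE_BUFFER_WIDTH and the height is the remaining texel count divided by that width, rounded up.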
+ */ + unsigned blocksize_B = util_format_get_blocksize(p_format); + unsigned size_px = + (layout->size_B - layout->layer_stride_B * layer) / blocksize_B; + + cfg.dimension = AGX_TEXTURE_DIMENSION_2D; + cfg.layout = AGX_LAYOUT_LINEAR; + cfg.width = AGX_TEXTURE_BUFFER_WIDTH; + cfg.height = DIV_ROUND_UP(size_px, cfg.width); + cfg.stride = (cfg.width * blocksize_B) - 4; + cfg.layers = 1; + cfg.levels = 1; + + cfg.buffer += layout->level_offsets_B[level]; + cfg.level = 0; + } else { + if (denom.x > 1) { + assert(denom.z == 1 && "todo how to handle?"); + assert(view->vk.level_count == 1); + assert(view->vk.layer_count == 1); + + cfg.buffer = + base_addr + ail_get_layer_level_B(layout, layer, level); + cfg.width = + DIV_ROUND_UP(u_minify(layout->width_px, level), denom.x); + cfg.height = + DIV_ROUND_UP(u_minify(layout->height_px, level), denom.y); + cfg.level = 0; + } else { + cfg.buffer = base_addr + ail_get_layer_offset_B(layout, layer); + cfg.width = layout->width_px; + cfg.height = layout->height_px; + cfg.level = level; + } + + if (layout->tiling == AIL_TILING_LINEAR && + (hk_image_view_type_is_array(view->vk.view_type))) { + + cfg.depth_linear = layers; + cfg.layer_stride_linear = (layout->layer_stride_B - 0x80); + cfg.extended = true; + } else { + assert((layout->tiling != AIL_TILING_LINEAR) || (layers == 1)); + cfg.layers = layers; + } + + cfg.levels = image->vk.mip_levels; + + if (layout->tiling == AIL_TILING_LINEAR) { + cfg.stride = ail_get_linear_stride_B(layout, level) - 4; + assert(cfg.levels == 1); + } else { + cfg.page_aligned_layers = layout->page_aligned_layers; + } + + if (image->vk.samples > 1) + cfg.samples = agx_translate_sample_count(image->vk.samples); + } + + if (ail_is_compressed(layout)) { + cfg.compressed_1 = true; + cfg.extended = true; + + cfg.acceleration_buffer = base_addr + layout->metadata_offset_B + + (layer * layout->compression_layer_stride_B); + } + + /* When the descriptor isn't extended architecturally, we use + * the last 8 bytes as a sideband to accelerate image atomics. 
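+ * The *_sw fields packed below (tile dimensions, level offset or MSAA-aligned width, sample count, layer stride) are that sideband; the image-atomic lowering reads them to compute texel addresses in software.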
+ */ + if (!cfg.extended && layout->writeable_image) { + if (msaa) { + assert(denom.x == 1 && "no MSAA of block-compressed"); + + cfg.aligned_width_msaa_sw = + align(u_minify(layout->width_px, level), + layout->tilesize_el[level].width_el); + } else { + cfg.level_offset_sw = ail_get_level_offset_B(layout, cfg.level); + } + + cfg.sample_count_log2_sw = util_logbase2(image->vk.samples); + + if (layout->tiling == AIL_TILING_TWIDDLED) { + struct ail_tile tile_size = layout->tilesize_el[level]; + cfg.tile_width_sw = tile_size.width_el; + cfg.tile_height_sw = tile_size.height_el; + + cfg.layer_stride_sw = layout->layer_stride_B; + } + } + }; +} + +static VkResult +add_descriptor(struct hk_device *dev, struct hk_image_view *view, + struct agx_texture_packed *desc, + struct agx_texture_packed *cached, uint32_t *index) +{ + /* First, look for a descriptor we already uploaded */ + for (unsigned i = 0; i < view->descriptor_count; ++i) { + if (memcmp(&cached[i], desc, sizeof *desc) == 0) { + *index = view->descriptor_index[i]; + return VK_SUCCESS; + } + } + + /* Else, add a new descriptor */ + VkResult result = + hk_descriptor_table_add(dev, &dev->images, desc, sizeof *desc, index); + if (result != VK_SUCCESS) + return result; + + uint32_t local_index = view->descriptor_count++; + assert(local_index < HK_MAX_IMAGE_DESCS); + + cached[local_index] = *desc; + view->descriptor_index[local_index] = *index; + return VK_SUCCESS; +} + +static VkResult +hk_image_view_init(struct hk_device *dev, struct hk_image_view *view, + bool driver_internal, + const VkImageViewCreateInfo *pCreateInfo) +{ + VK_FROM_HANDLE(hk_image, image, pCreateInfo->image); + VkResult result; + + memset(view, 0, sizeof(*view)); + + vk_image_view_init(&dev->vk, &view->vk, driver_internal, pCreateInfo); + + /* First, figure out which image planes we need. For depth/stencil, we only + * have one aspect viewed at a time. 
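+ * hk_image_aspects_to_plane() maps that single aspect to the correct image plane (e.g. plane 1 for the stencil aspect of VK_FORMAT_D32_SFLOAT_S8_UINT).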
+ */ + if (image->vk.aspects & + (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { + + view->plane_count = 1; + view->planes[0].image_plane = + hk_image_aspects_to_plane(image, view->vk.aspects); + } else { + /* For other formats, retrieve the plane count from the aspect mask + * and then walk through the aspect mask to map each image plane + * to its corresponding view plane + */ + assert(util_bitcount(view->vk.aspects) == + vk_format_get_plane_count(view->vk.format)); + view->plane_count = 0; + u_foreach_bit(aspect_bit, view->vk.aspects) { + uint8_t image_plane = + hk_image_aspects_to_plane(image, 1u << aspect_bit); + view->planes[view->plane_count++].image_plane = image_plane; + } + } + + struct agx_texture_packed cached[HK_MAX_IMAGE_DESCS]; + + /* Finally, fill in each view plane separately */ + for (unsigned view_plane = 0; view_plane < view->plane_count; view_plane++) { + const struct { + VkImageUsageFlagBits flag; + enum hk_desc_usage usage; + uint32_t *tex; + uint32_t *pbe; + } descriptors[] = { + {VK_IMAGE_USAGE_SAMPLED_BIT, HK_DESC_USAGE_SAMPLED, + &view->planes[view_plane].sampled_desc_index}, + + {VK_IMAGE_USAGE_STORAGE_BIT, HK_DESC_USAGE_STORAGE, + &view->planes[view_plane].ro_storage_desc_index, + &view->planes[view_plane].storage_desc_index}, + + {VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT, HK_DESC_USAGE_INPUT, + &view->planes[view_plane].ia_desc_index}, + + {VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, HK_DESC_USAGE_BG_EOT, + &view->planes[view_plane].background_desc_index, + &view->planes[view_plane].eot_pbe_desc_index}, + + {VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, HK_DESC_USAGE_LAYERED_BG_EOT, + &view->planes[view_plane].layered_background_desc_index, + &view->planes[view_plane].layered_eot_pbe_desc_index}, + }; + + for (unsigned i = 0; i < ARRAY_SIZE(descriptors); ++i) { + if (!(view->vk.usage & descriptors[i].flag)) + continue; + + for (unsigned is_pbe = 0; is_pbe < 2; ++is_pbe) { + struct agx_texture_packed desc; + uint32_t *out = is_pbe ? 
descriptors[i].pbe : descriptors[i].tex; + + if (!out) + continue; + + if (is_pbe) { + static_assert(sizeof(struct agx_pbe_packed) == + sizeof(struct agx_texture_packed)); + + pack_pbe(dev, view, view_plane, descriptors[i].usage, + (struct agx_pbe_packed *)&desc); + } else { + pack_texture(view, view_plane, descriptors[i].usage, &desc); + } + + result = add_descriptor(dev, view, &desc, cached, out); + if (result != VK_SUCCESS) + return result; + } + } + + if (view->vk.usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) { + pack_texture(view, view_plane, HK_DESC_USAGE_EMRT, + &view->planes[view_plane].emrt_texture); + + pack_pbe(dev, view, view_plane, HK_DESC_USAGE_EMRT, + &view->planes[view_plane].emrt_pbe); + } + } + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +hk_DestroyImageView(VkDevice _device, VkImageView imageView, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, dev, _device); + VK_FROM_HANDLE(hk_image_view, view, imageView); + + if (!view) + return; + + for (uint8_t d = 0; d < view->descriptor_count; ++d) { + hk_descriptor_table_remove(dev, &dev->images, view->descriptor_index[d]); + } + + vk_image_view_finish(&view->vk); + vk_free2(&dev->vk.alloc, pAllocator, view); +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateImageView(VkDevice _device, const VkImageViewCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, VkImageView *pView) +{ + VK_FROM_HANDLE(hk_device, dev, _device); + struct hk_image_view *view; + VkResult result; + + view = vk_alloc2(&dev->vk.alloc, pAllocator, sizeof(*view), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!view) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + result = hk_image_view_init( + dev, view, pCreateInfo->flags & VK_IMAGE_VIEW_CREATE_INTERNAL_MESA, + pCreateInfo); + if (result != VK_SUCCESS) { + hk_DestroyImageView(_device, hk_image_view_to_handle(view), pAllocator); + return result; + } + + *pView = hk_image_view_to_handle(view); + + return VK_SUCCESS; +} diff --git a/src/asahi/vulkan/hk_image_view.h b/src/asahi/vulkan/hk_image_view.h new file mode 100644 index 00000000000..4a5c7c79fb7 --- /dev/null +++ b/src/asahi/vulkan/hk_image_view.h @@ -0,0 +1,66 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "agx_pack.h" +#include "hk_private.h" +#include "vk_image.h" + +struct hk_device; + +#define HK_MAX_PLANES 3 +#define HK_MAX_IMAGE_DESCS (10 * HK_MAX_PLANES) + +struct hk_image_view { + struct vk_image_view vk; + + uint32_t descriptor_index[HK_MAX_IMAGE_DESCS]; + uint8_t descriptor_count; + + uint8_t plane_count; + struct { + uint8_t image_plane; + + /** Descriptors used for eMRT. We delay upload since we want them + * contiguous in memory, although this could be reworked if we wanted. + */ + struct agx_texture_packed emrt_texture; + struct agx_pbe_packed emrt_pbe; + + /** Index in the image descriptor table for the sampled image descriptor */ + uint32_t sampled_desc_index; + + /** Index in the image descriptor table for the storage image descriptor */ + uint32_t storage_desc_index; + + /** Index in the image descriptor table for the readonly storage image + * descriptor. + */ + uint32_t ro_storage_desc_index; + + /** Index in the image descriptor table for the texture descriptor used + * for background programs. 
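+ * Separate indices are kept for the single-layer and layered variants.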
+ */ + uint32_t background_desc_index; + uint32_t layered_background_desc_index; + + /** Index in the image descriptor table for the texture descriptor used + * for input attachments. + */ + uint32_t ia_desc_index; + + /** Index in the image descriptor table for the PBE descriptor used for + * end-of-tile programs. + */ + uint32_t eot_pbe_desc_index; + uint32_t layered_eot_pbe_desc_index; + } planes[3]; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_image_view, vk.base, VkImageView, + VK_OBJECT_TYPE_IMAGE_VIEW) diff --git a/src/asahi/vulkan/hk_instance.c b/src/asahi/vulkan/hk_instance.c new file mode 100644 index 00000000000..fdf113f0edf --- /dev/null +++ b/src/asahi/vulkan/hk_instance.c @@ -0,0 +1,196 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_instance.h" + +#include "hk_entrypoints.h" +#include "hk_physical_device.h" + +#include "vulkan/wsi/wsi_common.h" + +#include "util/build_id.h" +#include "util/driconf.h" +#include "util/mesa-sha1.h" + +VKAPI_ATTR VkResult VKAPI_CALL +hk_EnumerateInstanceVersion(uint32_t *pApiVersion) +{ + uint32_t version_override = vk_get_version_override(); + *pApiVersion = version_override ? version_override + : VK_MAKE_VERSION(1, 3, VK_HEADER_VERSION); + + return VK_SUCCESS; +} + +static const struct vk_instance_extension_table instance_extensions = { +#ifdef HK_USE_WSI_PLATFORM + .KHR_get_surface_capabilities2 = true, + .KHR_surface = true, + .KHR_surface_protected_capabilities = true, + .EXT_surface_maintenance1 = true, + .EXT_swapchain_colorspace = true, +#endif +#ifdef VK_USE_PLATFORM_WAYLAND_KHR + .KHR_wayland_surface = true, +#endif +#ifdef VK_USE_PLATFORM_XCB_KHR + .KHR_xcb_surface = true, +#endif +#ifdef VK_USE_PLATFORM_XLIB_KHR + .KHR_xlib_surface = true, +#endif +#ifdef VK_USE_PLATFORM_XLIB_XRANDR_EXT + .EXT_acquire_xlib_display = true, +#endif +#ifdef VK_USE_PLATFORM_DISPLAY_KHR + .KHR_display = true, + .KHR_get_display_properties2 = true, + .EXT_direct_mode_display = true, + .EXT_display_surface_counter = true, + .EXT_acquire_drm_display = true, +#endif +#ifndef VK_USE_PLATFORM_WIN32_KHR + .EXT_headless_surface = true, +#endif + .KHR_device_group_creation = true, + .KHR_external_fence_capabilities = true, + .KHR_external_memory_capabilities = true, + .KHR_external_semaphore_capabilities = true, + .KHR_get_physical_device_properties2 = true, + .EXT_debug_report = true, + .EXT_debug_utils = true, +}; + +VKAPI_ATTR VkResult VKAPI_CALL +hk_EnumerateInstanceExtensionProperties(const char *pLayerName, + uint32_t *pPropertyCount, + VkExtensionProperties *pProperties) +{ + if (pLayerName) + return vk_error(NULL, VK_ERROR_LAYER_NOT_PRESENT); + + return vk_enumerate_instance_extension_properties( + &instance_extensions, pPropertyCount, pProperties); +} + +static const driOptionDescription hk_dri_options[] = { + DRI_CONF_SECTION_PERFORMANCE DRI_CONF_ADAPTIVE_SYNC(true) + DRI_CONF_VK_X11_OVERRIDE_MIN_IMAGE_COUNT(0) + DRI_CONF_VK_X11_STRICT_IMAGE_COUNT(false) + DRI_CONF_VK_X11_ENSURE_MIN_IMAGE_COUNT(false) + DRI_CONF_VK_KHR_PRESENT_WAIT(false) + DRI_CONF_VK_XWAYLAND_WAIT_READY(false) DRI_CONF_SECTION_END + + DRI_CONF_SECTION_DEBUG DRI_CONF_FORCE_VK_VENDOR() + DRI_CONF_VK_WSI_FORCE_SWAPCHAIN_TO_CURRENT_EXTENT(false) + DRI_CONF_VK_X11_IGNORE_SUBOPTIMAL(false) + DRI_CONF_SECTION_END}; + +static void +hk_init_dri_options(struct hk_instance *instance) +{ + driParseOptionInfo(&instance->available_dri_options, hk_dri_options, + 
ARRAY_SIZE(hk_dri_options)); + driParseConfigFiles( + &instance->dri_options, &instance->available_dri_options, 0, "hk", NULL, + NULL, instance->vk.app_info.app_name, instance->vk.app_info.app_version, + instance->vk.app_info.engine_name, instance->vk.app_info.engine_version); + + instance->force_vk_vendor = + driQueryOptioni(&instance->dri_options, "force_vk_vendor"); +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateInstance(const VkInstanceCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkInstance *pInstance) +{ + struct hk_instance *instance; + VkResult result; + + if (pAllocator == NULL) + pAllocator = vk_default_allocator(); + + instance = vk_alloc(pAllocator, sizeof(*instance), 8, + VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + if (!instance) + return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY); + + struct vk_instance_dispatch_table dispatch_table; + vk_instance_dispatch_table_from_entrypoints(&dispatch_table, + &hk_instance_entrypoints, true); + vk_instance_dispatch_table_from_entrypoints( + &dispatch_table, &wsi_instance_entrypoints, false); + + result = vk_instance_init(&instance->vk, &instance_extensions, + &dispatch_table, pCreateInfo, pAllocator); + if (result != VK_SUCCESS) + goto fail_alloc; + + hk_init_dri_options(instance); + + instance->vk.physical_devices.try_create_for_drm = + hk_create_drm_physical_device; + instance->vk.physical_devices.destroy = hk_physical_device_destroy; + + const struct build_id_note *note = + build_id_find_nhdr_for_addr(hk_CreateInstance); + if (!note) { + result = vk_errorf(NULL, VK_ERROR_INITIALIZATION_FAILED, + "Failed to find build-id"); + goto fail_init; + } + + unsigned build_id_len = build_id_length(note); + if (build_id_len < SHA1_DIGEST_LENGTH) { + result = vk_errorf(NULL, VK_ERROR_INITIALIZATION_FAILED, + "build-id too short. It needs to be a SHA"); + goto fail_init; + } + + static_assert(sizeof(instance->driver_build_sha) == SHA1_DIGEST_LENGTH); + memcpy(instance->driver_build_sha, build_id_data(note), SHA1_DIGEST_LENGTH); + + *pInstance = hk_instance_to_handle(instance); + return VK_SUCCESS; + +fail_init: + vk_instance_finish(&instance->vk); +fail_alloc: + vk_free(pAllocator, instance); + + return result; +} + +VKAPI_ATTR void VKAPI_CALL +hk_DestroyInstance(VkInstance _instance, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_instance, instance, _instance); + + if (!instance) + return; + + driDestroyOptionCache(&instance->dri_options); + driDestroyOptionInfo(&instance->available_dri_options); + + vk_instance_finish(&instance->vk); + vk_free(&instance->vk.alloc, instance); +} + +VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL +hk_GetInstanceProcAddr(VkInstance _instance, const char *pName) +{ + VK_FROM_HANDLE(hk_instance, instance, _instance); + return vk_instance_get_proc_addr(&instance->vk, &hk_instance_entrypoints, + pName); +} + +PUBLIC VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL +vk_icdGetInstanceProcAddr(VkInstance instance, const char *pName) +{ + return hk_GetInstanceProcAddr(instance, pName); +} diff --git a/src/asahi/vulkan/hk_instance.h b/src/asahi/vulkan/hk_instance.h new file mode 100644 index 00000000000..d0c0397b02a --- /dev/null +++ b/src/asahi/vulkan/hk_instance.h @@ -0,0 +1,25 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "util/xmlconfig.h" +#include "hk_private.h" +#include "vk_instance.h" + +struct hk_instance { + struct vk_instance vk; + + struct driOptionCache dri_options; + struct driOptionCache available_dri_options; + + uint8_t driver_build_sha[20]; + uint32_t force_vk_vendor; +}; + +VK_DEFINE_HANDLE_CASTS(hk_instance, vk.base, VkInstance, + VK_OBJECT_TYPE_INSTANCE) diff --git a/src/asahi/vulkan/hk_nir_lower_descriptors.c b/src/asahi/vulkan/hk_nir_lower_descriptors.c new file mode 100644 index 00000000000..802e184ae5e --- /dev/null +++ b/src/asahi/vulkan/hk_nir_lower_descriptors.c @@ -0,0 +1,867 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "pipe/p_defines.h" +#include "vulkan/vulkan_core.h" +#include "agx_nir_passes.h" +#include "agx_pack.h" +#include "hk_cmd_buffer.h" +#include "hk_descriptor_set.h" +#include "hk_descriptor_set_layout.h" +#include "hk_shader.h" + +#include "nir.h" +#include "nir_builder.h" +#include "nir_builder_opcodes.h" +#include "nir_deref.h" +#include "nir_intrinsics.h" +#include "nir_intrinsics_indices.h" +#include "shader_enums.h" +#include "vk_pipeline.h" + +struct lower_descriptors_ctx { + const struct hk_descriptor_set_layout *set_layouts[HK_MAX_SETS]; + + bool clamp_desc_array_bounds; + nir_address_format ubo_addr_format; + nir_address_format ssbo_addr_format; +}; + +static const struct hk_descriptor_set_binding_layout * +get_binding_layout(uint32_t set, uint32_t binding, + const struct lower_descriptors_ctx *ctx) +{ + assert(set < HK_MAX_SETS); + assert(ctx->set_layouts[set] != NULL); + + const struct hk_descriptor_set_layout *set_layout = ctx->set_layouts[set]; + + assert(binding < set_layout->binding_count); + return &set_layout->binding[binding]; +} + +static nir_def * +load_speculatable(nir_builder *b, unsigned num_components, unsigned bit_size, + nir_def *addr, unsigned align) +{ + return nir_build_load_global_constant(b, num_components, bit_size, addr, + .align_mul = align, + .access = ACCESS_CAN_SPECULATE); +} + +static nir_def * +load_root(nir_builder *b, unsigned num_components, unsigned bit_size, + nir_def *offset, unsigned align) +{ + nir_def *root = nir_load_preamble(b, 1, 64, .base = HK_ROOT_UNIFORM); + + /* We've bound the address of the root descriptor, index in. 
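+ * The root descriptor's address itself comes from the HK_ROOT_UNIFORM
+ * preamble slot loaded above.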
*/ + nir_def *addr = nir_iadd(b, root, nir_u2u64(b, offset)); + + return load_speculatable(b, num_components, bit_size, addr, align); +} + +static bool +lower_load_constant(nir_builder *b, nir_intrinsic_instr *load, + const struct lower_descriptors_ctx *ctx) +{ + assert(load->intrinsic == nir_intrinsic_load_constant); + unreachable("todo: stick an address in the root descriptor or something"); + + uint32_t base = nir_intrinsic_base(load); + uint32_t range = nir_intrinsic_range(load); + + b->cursor = nir_before_instr(&load->instr); + + nir_def *offset = nir_iadd_imm(b, load->src[0].ssa, base); + nir_def *data = nir_load_ubo( + b, load->def.num_components, load->def.bit_size, nir_imm_int(b, 0), + offset, .align_mul = nir_intrinsic_align_mul(load), + .align_offset = nir_intrinsic_align_offset(load), .range_base = base, + .range = range); + + nir_def_rewrite_uses(&load->def, data); + + return true; +} + +static nir_def * +load_descriptor_set_addr(nir_builder *b, uint32_t set, + UNUSED const struct lower_descriptors_ctx *ctx) +{ + uint32_t set_addr_offset = + hk_root_descriptor_offset(sets) + set * sizeof(uint64_t); + + return load_root(b, 1, 64, nir_imm_int(b, set_addr_offset), 8); +} + +static nir_def * +load_dynamic_buffer_start(nir_builder *b, uint32_t set, + const struct lower_descriptors_ctx *ctx) +{ + int dynamic_buffer_start_imm = 0; + for (uint32_t s = 0; s < set; s++) { + if (ctx->set_layouts[s] == NULL) { + dynamic_buffer_start_imm = -1; + break; + } + + dynamic_buffer_start_imm += ctx->set_layouts[s]->dynamic_buffer_count; + } + + if (dynamic_buffer_start_imm >= 0) { + return nir_imm_int(b, dynamic_buffer_start_imm); + } else { + uint32_t root_offset = + hk_root_descriptor_offset(set_dynamic_buffer_start) + set; + + return nir_u2u32(b, load_root(b, 1, 8, nir_imm_int(b, root_offset), 1)); + } +} + +static nir_def * +load_descriptor(nir_builder *b, unsigned num_components, unsigned bit_size, + uint32_t set, uint32_t binding, nir_def *index, + unsigned offset_B, const struct lower_descriptors_ctx *ctx) +{ + const struct hk_descriptor_set_binding_layout *binding_layout = + get_binding_layout(set, binding, ctx); + + if (ctx->clamp_desc_array_bounds) + index = + nir_umin(b, index, nir_imm_int(b, binding_layout->array_size - 1)); + + switch (binding_layout->type) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { + /* Get the index in the root descriptor table dynamic_buffers array. 
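+ * That index is the set's dynamic-buffer start plus the binding's
+ * dynamic_buffer_index plus the caller's array index.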
*/
+ nir_def *dynamic_buffer_start = load_dynamic_buffer_start(b, set, ctx);
+
+ index = nir_iadd(b, index,
+ nir_iadd_imm(b, dynamic_buffer_start,
+ binding_layout->dynamic_buffer_index));
+
+ nir_def *root_desc_offset = nir_iadd_imm(
+ b, nir_imul_imm(b, index, sizeof(struct hk_buffer_address)),
+ hk_root_descriptor_offset(dynamic_buffers));
+
+ assert(num_components == 4 && bit_size == 32);
+ nir_def *desc = load_root(b, 4, 32, root_desc_offset, 16);
+
+ /* We know a priori that the .w component (offset) is zero */
+ return nir_vector_insert_imm(b, desc, nir_imm_int(b, 0), 3);
+ }
+
+ case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: {
+ nir_def *base_addr = nir_iadd_imm(
+ b, load_descriptor_set_addr(b, set, ctx), binding_layout->offset);
+
+ assert(binding_layout->stride == 1);
+ const uint32_t binding_size = binding_layout->array_size;
+
+ /* Convert it to nir_address_format_64bit_bounded_global */
+ assert(num_components == 4 && bit_size == 32);
+ return nir_vec4(b, nir_unpack_64_2x32_split_x(b, base_addr),
+ nir_unpack_64_2x32_split_y(b, base_addr),
+ nir_imm_int(b, binding_size), nir_imm_int(b, 0));
+ }
+
+ default: {
+ assert(binding_layout->stride > 0);
+ nir_def *desc_ubo_offset =
+ nir_iadd_imm(b, nir_imul_imm(b, index, binding_layout->stride),
+ binding_layout->offset + offset_B);
+
+ unsigned desc_align_mul = (1 << (ffs(binding_layout->stride) - 1));
+ desc_align_mul = MIN2(desc_align_mul, 16);
+ unsigned desc_align_offset = binding_layout->offset + offset_B;
+ desc_align_offset %= desc_align_mul;
+
+ nir_def *desc;
+ nir_def *set_addr = load_descriptor_set_addr(b, set, ctx);
+ desc = nir_load_global_constant_offset(
+ b, num_components, bit_size, set_addr, desc_ubo_offset,
+ .align_mul = desc_align_mul, .align_offset = desc_align_offset,
+ .access = ACCESS_CAN_SPECULATE);
+
+ if (binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ||
+ binding_layout->type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER) {
+ /* We know a priori that the .w component (offset) is zero */
+ assert(num_components == 4 && bit_size == 32);
+ desc = nir_vector_insert_imm(b, desc, nir_imm_int(b, 0), 3);
+ }
+ return desc;
+ }
+ }
+}
+
+static bool
+is_idx_intrin(nir_intrinsic_instr *intrin)
+{
+ while (intrin->intrinsic == nir_intrinsic_vulkan_resource_reindex) {
+ intrin = nir_src_as_intrinsic(intrin->src[0]);
+ if (intrin == NULL)
+ return false;
+ }
+
+ return intrin->intrinsic == nir_intrinsic_vulkan_resource_index;
+}
+
+static nir_def *
+load_descriptor_for_idx_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
+ const struct lower_descriptors_ctx *ctx)
+{
+ nir_def *index = nir_imm_int(b, 0);
+
+ while (intrin->intrinsic == nir_intrinsic_vulkan_resource_reindex) {
+ index = nir_iadd(b, index, intrin->src[1].ssa);
+ intrin = nir_src_as_intrinsic(intrin->src[0]);
+ }
+
+ assert(intrin->intrinsic == nir_intrinsic_vulkan_resource_index);
+ uint32_t set = nir_intrinsic_desc_set(intrin);
+ uint32_t binding = nir_intrinsic_binding(intrin);
+ index = nir_iadd(b, index, intrin->src[0].ssa);
+
+ return load_descriptor(b, 4, 32, set, binding, index, 0, ctx);
+}
+
+static bool
+try_lower_load_vulkan_descriptor(nir_builder *b, nir_intrinsic_instr *intrin,
+ const struct lower_descriptors_ctx *ctx)
+{
+ ASSERTED const VkDescriptorType desc_type = nir_intrinsic_desc_type(intrin);
+ b->cursor = nir_before_instr(&intrin->instr);
+
+ nir_intrinsic_instr *idx_intrin = nir_src_as_intrinsic(intrin->src[0]);
+ if (idx_intrin == NULL || !is_idx_intrin(idx_intrin)) {
+ assert(desc_type == 
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER || + desc_type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC); + return false; + } + + nir_def *desc = load_descriptor_for_idx_intrin(b, idx_intrin, ctx); + + nir_def_rewrite_uses(&intrin->def, desc); + + return true; +} + +static bool +_lower_sysval_to_root_table(nir_builder *b, nir_intrinsic_instr *intrin, + uint32_t root_table_offset) +{ + b->cursor = nir_instr_remove(&intrin->instr); + assert((root_table_offset & 3) == 0 && "aligned"); + + nir_def *val = load_root(b, intrin->def.num_components, intrin->def.bit_size, + nir_imm_int(b, root_table_offset), 4); + + nir_def_rewrite_uses(&intrin->def, val); + + return true; +} + +#define lower_sysval_to_root_table(b, intrin, member) \ + _lower_sysval_to_root_table(b, intrin, hk_root_descriptor_offset(member)) + +static bool +lower_load_push_constant(nir_builder *b, nir_intrinsic_instr *load, + const struct lower_descriptors_ctx *ctx) +{ + const uint32_t push_region_offset = hk_root_descriptor_offset(push); + const uint32_t base = nir_intrinsic_base(load); + + b->cursor = nir_before_instr(&load->instr); + + nir_def *offset = + nir_iadd_imm(b, load->src[0].ssa, push_region_offset + base); + + nir_def *val = load_root(b, load->def.num_components, load->def.bit_size, + offset, load->def.bit_size / 8); + + nir_def_rewrite_uses(&load->def, val); + + return true; +} + +static void +get_resource_deref_binding(nir_builder *b, nir_deref_instr *deref, + uint32_t *set, uint32_t *binding, nir_def **index) +{ + if (deref->deref_type == nir_deref_type_array) { + *index = deref->arr.index.ssa; + deref = nir_deref_instr_parent(deref); + } else { + *index = nir_imm_int(b, 0); + } + + assert(deref->deref_type == nir_deref_type_var); + nir_variable *var = deref->var; + + *set = var->data.descriptor_set; + *binding = var->data.binding; +} + +static nir_def * +load_resource_deref_desc(nir_builder *b, unsigned num_components, + unsigned bit_size, nir_deref_instr *deref, + unsigned offset_B, + const struct lower_descriptors_ctx *ctx) +{ + uint32_t set, binding; + nir_def *index; + get_resource_deref_binding(b, deref, &set, &binding, &index); + return load_descriptor(b, num_components, bit_size, set, binding, index, + offset_B, ctx); +} + +/* + * Returns an AGX bindless handle to access an indexed image within the global + * image heap. + */ +static nir_def * +image_heap_handle(nir_builder *b, nir_def *offset) +{ + return nir_vec2(b, nir_imm_int(b, HK_IMAGE_HEAP_UNIFORM), offset); +} + +static bool +lower_image_intrin(nir_builder *b, nir_intrinsic_instr *intr, + const struct lower_descriptors_ctx *ctx) +{ + b->cursor = nir_before_instr(&intr->instr); + nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); + + /* Reads and queries use the texture descriptor; writes and atomics PBE. 
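+ * Pick the matching heap offset out of the storage image descriptor.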
*/ + unsigned offs; + if (intr->intrinsic != nir_intrinsic_image_deref_load && + intr->intrinsic != nir_intrinsic_image_deref_size && + intr->intrinsic != nir_intrinsic_image_deref_samples) { + + offs = offsetof(struct hk_storage_image_descriptor, pbe_offset); + } else { + offs = offsetof(struct hk_storage_image_descriptor, tex_offset); + } + + nir_def *offset = load_resource_deref_desc(b, 1, 32, deref, offs, ctx); + nir_rewrite_image_intrinsic(intr, image_heap_handle(b, offset), true); + + return true; +} + +static VkQueryPipelineStatisticFlagBits +translate_pipeline_stat_bit(enum pipe_statistics_query_index pipe) +{ + switch (pipe) { + case PIPE_STAT_QUERY_IA_VERTICES: + return VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT; + case PIPE_STAT_QUERY_IA_PRIMITIVES: + return VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT; + case PIPE_STAT_QUERY_VS_INVOCATIONS: + return VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT; + case PIPE_STAT_QUERY_GS_INVOCATIONS: + return VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT; + case PIPE_STAT_QUERY_GS_PRIMITIVES: + return VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT; + case PIPE_STAT_QUERY_C_INVOCATIONS: + return VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT; + case PIPE_STAT_QUERY_C_PRIMITIVES: + return VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT; + case PIPE_STAT_QUERY_PS_INVOCATIONS: + return VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT; + case PIPE_STAT_QUERY_HS_INVOCATIONS: + return VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT; + case PIPE_STAT_QUERY_DS_INVOCATIONS: + return VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT; + case PIPE_STAT_QUERY_CS_INVOCATIONS: + return VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT; + case PIPE_STAT_QUERY_TS_INVOCATIONS: + return VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT; + case PIPE_STAT_QUERY_MS_INVOCATIONS: + return VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT; + } + + unreachable("invalid statistic"); +} + +static bool +lower_uvs_index(nir_builder *b, nir_intrinsic_instr *intrin, void *data) +{ + unsigned *vs_uniform_base = data; + + switch (intrin->intrinsic) { + case nir_intrinsic_load_uvs_index_agx: { + gl_varying_slot slot = nir_intrinsic_io_semantics(intrin).location; + unsigned offset = hk_root_descriptor_offset(draw.uvs_index[slot]); + b->cursor = nir_instr_remove(&intrin->instr); + + nir_def *val = load_root(b, 1, 8, nir_imm_int(b, offset), 1); + nir_def_rewrite_uses(&intrin->def, nir_u2u16(b, val)); + return true; + } + + case nir_intrinsic_load_shader_part_tests_zs_agx: + return lower_sysval_to_root_table(b, intrin, draw.no_epilog_discard); + + case nir_intrinsic_load_api_sample_mask_agx: + return lower_sysval_to_root_table(b, intrin, draw.api_sample_mask); + + case nir_intrinsic_load_sample_positions_agx: + return lower_sysval_to_root_table(b, intrin, draw.ppp_multisamplectl); + + case nir_intrinsic_load_depth_never_agx: + return lower_sysval_to_root_table(b, intrin, draw.force_never_in_shader); + + case nir_intrinsic_load_geometry_param_buffer_agx: + return lower_sysval_to_root_table(b, intrin, draw.geometry_params); + + case nir_intrinsic_load_vs_output_buffer_agx: + return lower_sysval_to_root_table(b, intrin, draw.vertex_output_buffer); + + case nir_intrinsic_load_vs_outputs_agx: + return lower_sysval_to_root_table(b, intrin, draw.vertex_outputs); + + case nir_intrinsic_load_tess_param_buffer_agx: + return 
lower_sysval_to_root_table(b, intrin, draw.tess_params); + + case nir_intrinsic_load_is_first_fan_agx: { + unsigned offset = hk_root_descriptor_offset(draw.provoking); + b->cursor = nir_instr_remove(&intrin->instr); + nir_def *val = load_root(b, 1, 16, nir_imm_int(b, offset), 2); + nir_def_rewrite_uses(&intrin->def, nir_ieq_imm(b, val, 1)); + return true; + } + + case nir_intrinsic_load_provoking_last: { + unsigned offset = hk_root_descriptor_offset(draw.provoking); + b->cursor = nir_instr_remove(&intrin->instr); + nir_def *val = load_root(b, 1, 16, nir_imm_int(b, offset), 2); + nir_def_rewrite_uses(&intrin->def, nir_b2b32(b, nir_ieq_imm(b, val, 2))); + return true; + } + + case nir_intrinsic_load_base_vertex: + case nir_intrinsic_load_first_vertex: + case nir_intrinsic_load_base_instance: + case nir_intrinsic_load_draw_id: + case nir_intrinsic_load_input_assembly_buffer_agx: { + b->cursor = nir_instr_remove(&intrin->instr); + + unsigned base = *vs_uniform_base; + unsigned size = 32; + + if (intrin->intrinsic == nir_intrinsic_load_base_instance) { + base += 2; + } else if (intrin->intrinsic == nir_intrinsic_load_draw_id) { + base += 4; + size = 16; + } else if (intrin->intrinsic == + nir_intrinsic_load_input_assembly_buffer_agx) { + base += 8; + size = 64; + } + + nir_def *val = nir_load_preamble(b, 1, size, .base = base); + nir_def_rewrite_uses(&intrin->def, + nir_u2uN(b, val, intrin->def.bit_size)); + return true; + } + + case nir_intrinsic_load_stat_query_address_agx: { + b->cursor = nir_instr_remove(&intrin->instr); + + unsigned off1 = hk_root_descriptor_offset(draw.pipeline_stats); + unsigned off2 = hk_root_descriptor_offset(draw.pipeline_stats_flags); + + nir_def *base = load_root(b, 1, 64, nir_imm_int(b, off1), 8); + nir_def *flags = load_root(b, 1, 16, nir_imm_int(b, off2), 2); + + unsigned query = nir_intrinsic_base(intrin); + VkQueryPipelineStatisticFlagBits bit = translate_pipeline_stat_bit(query); + + /* Prefix sum to find the compacted offset */ + nir_def *idx = nir_bit_count(b, nir_iand_imm(b, flags, bit - 1)); + nir_def *addr = nir_iadd( + b, base, nir_imul_imm(b, nir_u2u64(b, idx), sizeof(uint64_t))); + + /* The above returns garbage if the query isn't actually enabled, handle + * that case. + * + * TODO: Optimize case where we *know* the query is present? 
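+ * For now, fold the address to NULL whenever the statistic's flag bit is
+ * clear.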
+ */ + nir_def *present = nir_ine_imm(b, nir_iand_imm(b, flags, bit), 0); + addr = nir_bcsel(b, present, addr, nir_imm_int64(b, 0)); + + nir_def_rewrite_uses(&intrin->def, addr); + return true; + } + + default: + return false; + } +} + +bool +hk_lower_uvs_index(nir_shader *s, unsigned vs_uniform_base) +{ + return nir_shader_intrinsics_pass( + s, lower_uvs_index, nir_metadata_control_flow, &vs_uniform_base); +} + +static bool +try_lower_intrin(nir_builder *b, nir_intrinsic_instr *intrin, + const struct lower_descriptors_ctx *ctx) +{ + switch (intrin->intrinsic) { + case nir_intrinsic_load_constant: + return lower_load_constant(b, intrin, ctx); + + case nir_intrinsic_load_vulkan_descriptor: + return try_lower_load_vulkan_descriptor(b, intrin, ctx); + + case nir_intrinsic_load_workgroup_size: + unreachable("Should have been lowered by nir_lower_cs_intrinsics()"); + + case nir_intrinsic_load_base_workgroup_id: + return lower_sysval_to_root_table(b, intrin, cs.base_group); + + case nir_intrinsic_load_push_constant: + return lower_load_push_constant(b, intrin, ctx); + + case nir_intrinsic_load_view_index: + return lower_sysval_to_root_table(b, intrin, draw.view_index); + + case nir_intrinsic_image_deref_load: + case nir_intrinsic_image_deref_sparse_load: + case nir_intrinsic_image_deref_store: + case nir_intrinsic_image_deref_atomic: + case nir_intrinsic_image_deref_atomic_swap: + case nir_intrinsic_image_deref_size: + case nir_intrinsic_image_deref_samples: + return lower_image_intrin(b, intrin, ctx); + + case nir_intrinsic_load_num_workgroups: { + b->cursor = nir_instr_remove(&intrin->instr); + + unsigned offset = hk_root_descriptor_offset(cs.group_count_addr); + nir_def *ptr = load_root(b, 1, 64, nir_imm_int(b, offset), 4); + nir_def *val = load_speculatable(b, 3, 32, ptr, 4); + + nir_def_rewrite_uses(&intrin->def, val); + return true; + } + + default: + return false; + } +} + +static bool +lower_tex(nir_builder *b, nir_tex_instr *tex, + const struct lower_descriptors_ctx *ctx) +{ + b->cursor = nir_before_instr(&tex->instr); + + nir_def *texture = nir_steal_tex_src(tex, nir_tex_src_texture_deref); + nir_def *sampler = nir_steal_tex_src(tex, nir_tex_src_sampler_deref); + if (!texture) { + assert(!sampler); + return false; + } + + nir_def *plane_ssa = nir_steal_tex_src(tex, nir_tex_src_plane); + const uint32_t plane = + plane_ssa ? nir_src_as_uint(nir_src_for_ssa(plane_ssa)) : 0; + const uint64_t plane_offset_B = + plane * sizeof(struct hk_sampled_image_descriptor); + + /* LOD bias is passed in the descriptor set, rather than embedded into + * the sampler descriptor. There's no spot in the hardware descriptor, + * plus this saves on precious sampler heap spots. 
+ */ + if (tex->op == nir_texop_lod_bias_agx) { + unsigned offs = + offsetof(struct hk_sampled_image_descriptor, lod_bias_fp16); + + nir_def *bias = load_resource_deref_desc( + b, 1, 16, nir_src_as_deref(nir_src_for_ssa(sampler)), + plane_offset_B + offs, ctx); + + nir_def_replace(&tex->def, bias); + return true; + } + + if (tex->op == nir_texop_has_custom_border_color_agx) { + unsigned offs = offsetof(struct hk_sampled_image_descriptor, has_border); + + nir_def *res = load_resource_deref_desc( + b, 1, 16, nir_src_as_deref(nir_src_for_ssa(sampler)), + plane_offset_B + offs, ctx); + + nir_def_replace(&tex->def, nir_ine_imm(b, res, 0)); + return true; + } + + if (tex->op == nir_texop_custom_border_color_agx) { + unsigned offs = offsetof(struct hk_sampled_image_descriptor, border); + + nir_def *border = load_resource_deref_desc( + b, 4, 32, nir_src_as_deref(nir_src_for_ssa(sampler)), + plane_offset_B + offs, ctx); + + nir_alu_type T = nir_alu_type_get_base_type(tex->dest_type); + border = nir_convert_to_bit_size(b, border, T, tex->def.bit_size); + + nir_def_replace(&tex->def, border); + return true; + } + + { + unsigned offs = + offsetof(struct hk_sampled_image_descriptor, image_offset); + + nir_def *offset = load_resource_deref_desc( + b, 1, 32, nir_src_as_deref(nir_src_for_ssa(texture)), + plane_offset_B + offs, ctx); + + nir_def *handle = image_heap_handle(b, offset); + nir_tex_instr_add_src(tex, nir_tex_src_texture_handle, handle); + } + + if (sampler != NULL) { + unsigned offs = + offsetof(struct hk_sampled_image_descriptor, sampler_index); + + if (tex->backend_flags & AGX_TEXTURE_FLAG_CLAMP_TO_0) { + offs = + offsetof(struct hk_sampled_image_descriptor, clamp_0_sampler_index); + } + + nir_def *index = load_resource_deref_desc( + b, 1, 16, nir_src_as_deref(nir_src_for_ssa(sampler)), + plane_offset_B + offs, ctx); + + nir_tex_instr_add_src(tex, nir_tex_src_sampler_handle, index); + } + + return true; +} + +static bool +try_lower_descriptors_instr(nir_builder *b, nir_instr *instr, void *_data) +{ + const struct lower_descriptors_ctx *ctx = _data; + + switch (instr->type) { + case nir_instr_type_tex: + return lower_tex(b, nir_instr_as_tex(instr), ctx); + case nir_instr_type_intrinsic: + return try_lower_intrin(b, nir_instr_as_intrinsic(instr), ctx); + default: + return false; + } +} + +static bool +lower_ssbo_resource_index(nir_builder *b, nir_intrinsic_instr *intrin, + const struct lower_descriptors_ctx *ctx) +{ + const VkDescriptorType desc_type = nir_intrinsic_desc_type(intrin); + if (desc_type != VK_DESCRIPTOR_TYPE_STORAGE_BUFFER && + desc_type != VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) + return false; + + b->cursor = nir_instr_remove(&intrin->instr); + + uint32_t set = nir_intrinsic_desc_set(intrin); + uint32_t binding = nir_intrinsic_binding(intrin); + nir_def *index = intrin->src[0].ssa; + + const struct hk_descriptor_set_binding_layout *binding_layout = + get_binding_layout(set, binding, ctx); + + nir_def *binding_addr; + uint8_t binding_stride; + switch (binding_layout->type) { + case VK_DESCRIPTOR_TYPE_MUTABLE_EXT: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: { + nir_def *set_addr = load_descriptor_set_addr(b, set, ctx); + binding_addr = nir_iadd_imm(b, set_addr, binding_layout->offset); + binding_stride = binding_layout->stride; + break; + } + + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { + const uint32_t root_desc_addr_offset = + hk_root_descriptor_offset(root_desc_addr); + + nir_def *root_desc_addr = + load_root(b, 1, 64, nir_imm_int(b, root_desc_addr_offset), 8); + + 
nir_def *dynamic_buffer_start = + nir_iadd_imm(b, load_dynamic_buffer_start(b, set, ctx), + binding_layout->dynamic_buffer_index); + + nir_def *dynamic_binding_offset = + nir_iadd_imm(b, + nir_imul_imm(b, dynamic_buffer_start, + sizeof(struct hk_buffer_address)), + hk_root_descriptor_offset(dynamic_buffers)); + + binding_addr = + nir_iadd(b, root_desc_addr, nir_u2u64(b, dynamic_binding_offset)); + binding_stride = sizeof(struct hk_buffer_address); + break; + } + + default: + unreachable("Not an SSBO descriptor"); + } + + /* Tuck the stride in the top 8 bits of the binding address */ + binding_addr = nir_ior_imm(b, binding_addr, (uint64_t)binding_stride << 56); + + const uint32_t binding_size = binding_layout->array_size * binding_stride; + nir_def *offset_in_binding = nir_imul_imm(b, index, binding_stride); + + nir_def *addr = nir_vec4(b, nir_unpack_64_2x32_split_x(b, binding_addr), + nir_unpack_64_2x32_split_y(b, binding_addr), + nir_imm_int(b, binding_size), offset_in_binding); + + nir_def_rewrite_uses(&intrin->def, addr); + + return true; +} + +static bool +lower_ssbo_resource_reindex(nir_builder *b, nir_intrinsic_instr *intrin, + const struct lower_descriptors_ctx *ctx) +{ + const VkDescriptorType desc_type = nir_intrinsic_desc_type(intrin); + if (desc_type != VK_DESCRIPTOR_TYPE_STORAGE_BUFFER && + desc_type != VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) + return false; + + b->cursor = nir_instr_remove(&intrin->instr); + + nir_def *addr = intrin->src[0].ssa; + nir_def *index = intrin->src[1].ssa; + + nir_def *addr_high32 = nir_channel(b, addr, 1); + nir_def *stride = nir_ushr_imm(b, addr_high32, 24); + nir_def *offset = nir_imul(b, index, stride); + + addr = nir_build_addr_iadd(b, addr, ctx->ssbo_addr_format, nir_var_mem_ssbo, + offset); + nir_def_rewrite_uses(&intrin->def, addr); + + return true; +} + +static bool +lower_load_ssbo_descriptor(nir_builder *b, nir_intrinsic_instr *intrin, + const struct lower_descriptors_ctx *ctx) +{ + const VkDescriptorType desc_type = nir_intrinsic_desc_type(intrin); + if (desc_type != VK_DESCRIPTOR_TYPE_STORAGE_BUFFER && + desc_type != VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) + return false; + + b->cursor = nir_instr_remove(&intrin->instr); + + nir_def *addr = intrin->src[0].ssa; + + nir_def *desc; + switch (ctx->ssbo_addr_format) { + case nir_address_format_64bit_global_32bit_offset: { + nir_def *base = nir_pack_64_2x32(b, nir_trim_vector(b, addr, 2)); + nir_def *offset = nir_channel(b, addr, 3); + /* Mask off the binding stride */ + base = nir_iand_imm(b, base, BITFIELD64_MASK(56)); + desc = nir_load_global_constant_offset(b, 4, 32, base, offset, + .align_mul = 16, .align_offset = 0, + .access = ACCESS_CAN_SPECULATE); + break; + } + + case nir_address_format_64bit_bounded_global: { + nir_def *base = nir_pack_64_2x32(b, nir_trim_vector(b, addr, 2)); + nir_def *size = nir_channel(b, addr, 2); + nir_def *offset = nir_channel(b, addr, 3); + /* Mask off the binding stride */ + base = nir_iand_imm(b, base, BITFIELD64_MASK(56)); + desc = nir_load_global_constant_bounded( + b, 4, 32, base, offset, size, .align_mul = 16, .align_offset = 0, + .access = ACCESS_CAN_SPECULATE); + break; + } + + default: + unreachable("Unknown address mode"); + } + + nir_def_rewrite_uses(&intrin->def, desc); + + return true; +} + +static bool +lower_ssbo_descriptor(nir_builder *b, nir_intrinsic_instr *intr, void *_data) +{ + const struct lower_descriptors_ctx *ctx = _data; + + switch (intr->intrinsic) { + case nir_intrinsic_vulkan_resource_index: + return 
lower_ssbo_resource_index(b, intr, ctx); + case nir_intrinsic_vulkan_resource_reindex: + return lower_ssbo_resource_reindex(b, intr, ctx); + case nir_intrinsic_load_vulkan_descriptor: + return lower_load_ssbo_descriptor(b, intr, ctx); + default: + return false; + } +} + +bool +hk_nir_lower_descriptors(nir_shader *nir, + const struct vk_pipeline_robustness_state *rs, + uint32_t set_layout_count, + struct vk_descriptor_set_layout *const *set_layouts) +{ + struct lower_descriptors_ctx ctx = { + .clamp_desc_array_bounds = + rs->storage_buffers != + VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT || + + rs->uniform_buffers != + VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT || + + rs->images != VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_DISABLED_EXT, + + .ssbo_addr_format = hk_buffer_addr_format(rs->storage_buffers), + .ubo_addr_format = hk_buffer_addr_format(rs->uniform_buffers), + }; + + assert(set_layout_count <= HK_MAX_SETS); + for (uint32_t s = 0; s < set_layout_count; s++) { + if (set_layouts[s] != NULL) + ctx.set_layouts[s] = vk_to_hk_descriptor_set_layout(set_layouts[s]); + } + + /* First lower everything but complex SSBOs, then lower complex SSBOs. + * + * TODO: See if we can unify this, not sure if the fast path matters on + * Apple. This is inherited from NVK. + */ + bool pass_lower_descriptors = nir_shader_instructions_pass( + nir, try_lower_descriptors_instr, nir_metadata_control_flow, &ctx); + + bool pass_lower_ssbo = nir_shader_intrinsics_pass( + nir, lower_ssbo_descriptor, nir_metadata_control_flow, &ctx); + + return pass_lower_descriptors || pass_lower_ssbo; +} diff --git a/src/asahi/vulkan/hk_nir_passthrough_gs.c b/src/asahi/vulkan/hk_nir_passthrough_gs.c new file mode 100644 index 00000000000..536b10c6b96 --- /dev/null +++ b/src/asahi/vulkan/hk_nir_passthrough_gs.c @@ -0,0 +1,112 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022 Collabora Ltd. 
+ * SPDX-License-Identifier: MIT + */ + +#include "util/bitscan.h" +#include "hk_shader.h" +#include "nir.h" +#include "nir_builder.h" +#include "nir_xfb_info.h" +#include "shader_enums.h" + +void +hk_nir_passthrough_gs(nir_builder *b, const void *key_) +{ + nir_shader *s = b->shader; + const struct hk_passthrough_gs_key *key = key_; + assert(key->prim == u_decomposed_prim(key->prim)); + assert(key->prim != MESA_PRIM_PATCHES && "tessellation consumes patches"); + + enum mesa_prim out; + if (key->prim == MESA_PRIM_POINTS) + out = MESA_PRIM_POINTS; + else if (u_reduced_prim(key->prim) == MESA_PRIM_LINES) + out = MESA_PRIM_LINE_STRIP; + else + out = MESA_PRIM_TRIANGLE_STRIP; + +#if 0 + assert((key->outputs & + (VARYING_BIT_BOUNDING_BOX0 | VARYING_BIT_BOUNDING_BOX1)) == 0 && + "cull distance lowering not run yet"); +#endif + /* XXX: need rework of preprocess_nir */ + uint64_t outputs = + key->outputs & ~(VARYING_BIT_BOUNDING_BOX0 | VARYING_BIT_BOUNDING_BOX1); + + s->info.outputs_written = s->info.inputs_read = outputs; + s->info.clip_distance_array_size = key->clip_distance_array_size; + s->info.cull_distance_array_size = key->cull_distance_array_size; + s->info.stage = MESA_SHADER_GEOMETRY; + s->info.gs.input_primitive = key->prim; + s->info.gs.output_primitive = out; + s->info.gs.vertices_in = mesa_vertices_per_prim(key->prim); + s->info.gs.vertices_out = mesa_vertices_per_prim(out); + s->info.gs.invocations = 1; + s->info.gs.active_stream_mask = 1; + + if (key->xfb_info.output_count) { + size_t size = nir_xfb_info_size(key->xfb_info.output_count); + s->xfb_info = ralloc_memdup(s, &key->xfb_info, size); + s->info.has_transform_feedback_varyings = true; + memcpy(s->info.xfb_stride, key->xfb_stride, sizeof(key->xfb_stride)); + } + + unsigned int start_vert = key->prim == MESA_PRIM_LINES_ADJACENCY ? 1 : 0; + unsigned int step = key->prim == MESA_PRIM_TRIANGLES_ADJACENCY ? 2 : 1; + + nir_def *zero = nir_imm_int(b, 0); + nir_def *one = nir_imm_int(b, 1); + + for (unsigned i = 0; i < s->info.gs.vertices_out; ++i) { + nir_def *vertex = nir_imm_int(b, start_vert + (i * step)); + + /* Copy inputs to outputs. */ + u_foreach_bit64(loc, outputs) { + unsigned adjusted_loc = loc; + nir_def *offset = zero; + unsigned num_slots = 1; + + bool scalar = loc == VARYING_SLOT_LAYER || + loc == VARYING_SLOT_VIEW_INDEX || + loc == VARYING_SLOT_VIEWPORT || loc == VARYING_SLOT_PSIZ; + unsigned comps = scalar ? 
1 : 4; + + /* We use combined, compact clip/cull */ + if (loc == VARYING_SLOT_CLIP_DIST1 || loc == VARYING_SLOT_CULL_DIST1) { + adjusted_loc--; + offset = one; + } + + if (adjusted_loc == VARYING_SLOT_CLIP_DIST0 || + adjusted_loc == VARYING_SLOT_CULL_DIST0) { + num_slots = + key->cull_distance_array_size + key->clip_distance_array_size; + + if (loc > adjusted_loc) + comps = num_slots - 4; + else + comps = MIN2(num_slots, 4); + } + + nir_io_semantics sem = { + .location = adjusted_loc, + .num_slots = num_slots, + }; + + nir_def *val = nir_load_per_vertex_input(b, comps, 32, vertex, offset, + .io_semantics = sem); + + for (unsigned c = 0; c < comps; ++c) { + nir_store_output(b, nir_channel(b, val, c), offset, + .io_semantics = sem, .src_type = nir_type_uint32, + .component = c); + } + } + + nir_emit_vertex(b, 0); + } +} diff --git a/src/asahi/vulkan/hk_physical_device.c b/src/asahi/vulkan/hk_physical_device.c new file mode 100644 index 00000000000..304cc7c938d --- /dev/null +++ b/src/asahi/vulkan/hk_physical_device.c @@ -0,0 +1,1417 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_physical_device.h" + +#include "asahi/lib/agx_device.h" +#include "asahi/lib/agx_nir_lower_vbo.h" +#include "asahi/lib/agx_nir_passes.h" +#include "util/disk_cache.h" +#include "util/mesa-sha1.h" +#include "git_sha1.h" +#include "hk_buffer.h" +#include "hk_entrypoints.h" +#include "hk_image.h" +#include "hk_instance.h" +#include "hk_private.h" +#include "hk_shader.h" +#include "hk_wsi.h" + +#include "util/u_debug.h" +#include "vulkan/vulkan_core.h" +#include "vulkan/wsi/wsi_common.h" +#include "vk_device.h" +#include "vk_drm_syncobj.h" +#include "vk_shader_module.h" + +#include +#include +#include +#include +#include + +static uint32_t +hk_get_vk_version() +{ + /* Version override takes priority */ + const uint32_t version_override = vk_get_version_override(); + if (version_override) + return version_override; + + return VK_MAKE_VERSION(1, 3, VK_HEADER_VERSION); +} + +static void +hk_get_device_extensions(const struct hk_instance *instance, + struct vk_device_extension_table *ext) +{ + *ext = (struct vk_device_extension_table){ + .KHR_8bit_storage = true, + .KHR_16bit_storage = true, + .KHR_bind_memory2 = true, + .KHR_buffer_device_address = true, + .KHR_calibrated_timestamps = false, + .KHR_copy_commands2 = true, + .KHR_create_renderpass2 = true, + .KHR_dedicated_allocation = true, + .KHR_depth_stencil_resolve = true, + .KHR_descriptor_update_template = true, + .KHR_device_group = true, + .KHR_draw_indirect_count = false, + .KHR_driver_properties = true, + .KHR_dynamic_rendering = true, + // TODO + .KHR_dynamic_rendering_local_read = false, + .KHR_external_fence = true, + .KHR_external_fence_fd = true, + .KHR_external_memory = true, + .KHR_external_memory_fd = true, + /* XXX: External timeline semaphores maybe broken in kernel, see + * dEQP-VK.synchronization.signal_order.shared_timeline_semaphore.write_copy_buffer_to_image_read_image_compute.image_128_r32_uint_opaque_fd + */ + .KHR_external_semaphore = false, + .KHR_external_semaphore_fd = false, + .KHR_format_feature_flags2 = true, + .KHR_fragment_shader_barycentric = false, + .KHR_get_memory_requirements2 = true, + .KHR_global_priority = true, + .KHR_image_format_list = true, + .KHR_imageless_framebuffer = true, +#ifdef HK_USE_WSI_PLATFORM + .KHR_incremental_present = true, +#endif + .KHR_index_type_uint8 = true, + 
.KHR_line_rasterization = true, + .KHR_load_store_op_none = true, + .KHR_maintenance1 = true, + .KHR_maintenance2 = true, + .KHR_maintenance3 = true, + .KHR_maintenance4 = true, + .KHR_maintenance5 = true, + .KHR_maintenance6 = true, + .KHR_map_memory2 = true, + .KHR_multiview = true, + .KHR_pipeline_executable_properties = true, + .KHR_pipeline_library = true, + .KHR_push_descriptor = true, + .KHR_relaxed_block_layout = true, + .KHR_sampler_mirror_clamp_to_edge = true, + .KHR_sampler_ycbcr_conversion = false, + .KHR_separate_depth_stencil_layouts = true, + .KHR_shader_atomic_int64 = false, + .KHR_shader_clock = false, + .KHR_shader_draw_parameters = true, + .KHR_shader_expect_assume = true, + .KHR_shader_float_controls = true, + // TODO: wait for nvk + .KHR_shader_float_controls2 = true, + .KHR_shader_float16_int8 = true, + .KHR_shader_integer_dot_product = true, + .KHR_shader_maximal_reconvergence = true, + .KHR_shader_non_semantic_info = true, + .KHR_shader_subgroup_extended_types = true, + .KHR_shader_subgroup_rotate = true, + .KHR_shader_subgroup_uniform_control_flow = true, + .KHR_shader_terminate_invocation = true, + .KHR_spirv_1_4 = true, + .KHR_storage_buffer_storage_class = true, + .KHR_timeline_semaphore = true, +#ifdef HK_USE_WSI_PLATFORM + .KHR_swapchain = true, + .KHR_swapchain_mutable_format = true, +#endif + .KHR_synchronization2 = true, + .KHR_uniform_buffer_standard_layout = true, + .KHR_variable_pointers = true, + .KHR_vertex_attribute_divisor = true, + .KHR_vulkan_memory_model = true, + .KHR_workgroup_memory_explicit_layout = true, + .KHR_zero_initialize_workgroup_memory = true, + .EXT_4444_formats = true, + .EXT_attachment_feedback_loop_layout = true, + .EXT_border_color_swizzle = true, + .EXT_buffer_device_address = true, + .EXT_calibrated_timestamps = false, + .EXT_conditional_rendering = false, + .EXT_color_write_enable = true, + .EXT_custom_border_color = true, + .EXT_depth_bias_control = false, + .EXT_depth_clip_control = false, + .EXT_depth_clip_enable = true, + .EXT_descriptor_indexing = true, +#ifdef VK_USE_PLATFORM_DISPLAY_KHR + .EXT_display_control = false, +#endif + .EXT_dynamic_rendering_unused_attachments = true, + .EXT_extended_dynamic_state = true, + .EXT_extended_dynamic_state2 = true, + .EXT_extended_dynamic_state3 = true, + .EXT_external_memory_dma_buf = true, + // TODO + .EXT_global_priority = false, + // TODO + .EXT_global_priority_query = false, + .EXT_graphics_pipeline_library = true, + .EXT_host_query_reset = true, + .EXT_host_image_copy = true, + .EXT_image_2d_view_of_3d = true, + .EXT_image_robustness = true, + .EXT_image_sliced_view_of_3d = false, + .EXT_image_view_min_lod = false, + .EXT_index_type_uint8 = true, + .EXT_inline_uniform_block = true, + .EXT_line_rasterization = true, + .EXT_load_store_op_none = true, + .EXT_map_memory_placed = false, + .EXT_memory_budget = false, + .EXT_multi_draw = true, + .EXT_mutable_descriptor_type = true, + .EXT_non_seamless_cube_map = true, + .EXT_pipeline_creation_cache_control = true, + .EXT_pipeline_creation_feedback = true, + .EXT_pipeline_protected_access = true, + .EXT_pipeline_robustness = true, + .EXT_physical_device_drm = true, + .EXT_primitive_topology_list_restart = true, + .EXT_private_data = true, + .EXT_primitives_generated_query = false, + .EXT_provoking_vertex = true, + .EXT_robustness2 = true, + .EXT_sample_locations = true, + .EXT_sampler_filter_minmax = false, + .EXT_scalar_block_layout = true, + .EXT_separate_stencil_usage = true, + .EXT_shader_image_atomic_int64 = false, + 
.EXT_shader_demote_to_helper_invocation = true, + .EXT_shader_module_identifier = true, + .EXT_shader_object = true, + .EXT_shader_replicated_composites = true, + .EXT_shader_stencil_export = true, + .EXT_shader_subgroup_ballot = true, + .EXT_shader_subgroup_vote = true, + .EXT_shader_viewport_index_layer = true, + .EXT_subgroup_size_control = true, +#ifdef HK_USE_WSI_PLATFORM + .EXT_swapchain_maintenance1 = true, +#endif + .EXT_texel_buffer_alignment = true, + .EXT_tooling_info = true, + .EXT_transform_feedback = true, + .EXT_vertex_attribute_divisor = true, + .EXT_vertex_input_dynamic_state = true, + .EXT_ycbcr_2plane_444_formats = false, + .EXT_ycbcr_image_arrays = false, + .GOOGLE_decorate_string = true, + .GOOGLE_hlsl_functionality1 = true, + .GOOGLE_user_type = true, + .VALVE_mutable_descriptor_type = true, + }; +} + +static void +hk_get_device_features( + const struct vk_device_extension_table *supported_extensions, + struct vk_features *features) +{ + *features = (struct vk_features){ + /* Vulkan 1.0 */ + .robustBufferAccess = true, + .fullDrawIndexUint32 = true, + .imageCubeArray = true, + .independentBlend = true, + .geometryShader = true, + .tessellationShader = true, + .sampleRateShading = true, + .dualSrcBlend = true, + .logicOp = true, + .multiDrawIndirect = true, + .drawIndirectFirstInstance = true, + .depthClamp = true, + .depthBiasClamp = true, + .fillModeNonSolid = true, + .depthBounds = false, + .wideLines = true, + .largePoints = true, + .alphaToOne = true, + .multiViewport = true, + .samplerAnisotropy = true, + .textureCompressionETC2 = false, + .textureCompressionBC = true, + .textureCompressionASTC_LDR = false, + .occlusionQueryPrecise = true, + .pipelineStatisticsQuery = true, + .vertexPipelineStoresAndAtomics = true, + .fragmentStoresAndAtomics = true, + .shaderTessellationAndGeometryPointSize = true, + .shaderImageGatherExtended = true, + .shaderStorageImageExtendedFormats = true, + /* TODO: hitting the vertex shader timeout in CTS, but should work */ + .shaderStorageImageMultisample = false, + .shaderStorageImageReadWithoutFormat = true, + .shaderStorageImageWriteWithoutFormat = true, + .shaderUniformBufferArrayDynamicIndexing = true, + .shaderSampledImageArrayDynamicIndexing = true, + .shaderStorageBufferArrayDynamicIndexing = true, + .shaderStorageImageArrayDynamicIndexing = true, + .shaderClipDistance = true, + .shaderCullDistance = true, + .shaderFloat64 = false, + .shaderInt64 = true, + .shaderInt16 = true, + .shaderResourceResidency = false, + .shaderResourceMinLod = false, + .sparseBinding = false, + .sparseResidency2Samples = false, + .sparseResidency4Samples = false, + .sparseResidency8Samples = false, + .sparseResidencyAliased = false, + .sparseResidencyBuffer = false, + .sparseResidencyImage2D = false, + .sparseResidencyImage3D = false, + .variableMultisampleRate = false, + .inheritedQueries = true, + + /* Vulkan 1.1 */ + .storageBuffer16BitAccess = true, + .uniformAndStorageBuffer16BitAccess = true, + .storagePushConstant16 = true, + .storageInputOutput16 = false, + .multiview = true, + .multiviewGeometryShader = false, + .multiviewTessellationShader = false, + .variablePointersStorageBuffer = true, + .variablePointers = true, + .shaderDrawParameters = true, + .samplerYcbcrConversion = true, + + /* Vulkan 1.2 */ + .samplerMirrorClampToEdge = true, + .drawIndirectCount = false, + .storageBuffer8BitAccess = true, + .uniformAndStorageBuffer8BitAccess = true, + .storagePushConstant8 = true, + .shaderBufferInt64Atomics = false, + .shaderSharedInt64Atomics 
= false, + .shaderFloat16 = true, + .shaderInt8 = true, + .descriptorIndexing = true, + .shaderInputAttachmentArrayDynamicIndexing = true, + .shaderUniformTexelBufferArrayDynamicIndexing = true, + .shaderStorageTexelBufferArrayDynamicIndexing = true, + .shaderUniformBufferArrayNonUniformIndexing = true, + .shaderSampledImageArrayNonUniformIndexing = true, + .shaderStorageBufferArrayNonUniformIndexing = true, + .shaderStorageImageArrayNonUniformIndexing = true, + .shaderInputAttachmentArrayNonUniformIndexing = true, + .shaderUniformTexelBufferArrayNonUniformIndexing = true, + .shaderStorageTexelBufferArrayNonUniformIndexing = true, + .descriptorBindingUniformBufferUpdateAfterBind = true, + .descriptorBindingSampledImageUpdateAfterBind = true, + .descriptorBindingStorageImageUpdateAfterBind = true, + .descriptorBindingStorageBufferUpdateAfterBind = true, + .descriptorBindingUniformTexelBufferUpdateAfterBind = true, + .descriptorBindingStorageTexelBufferUpdateAfterBind = true, + .descriptorBindingUpdateUnusedWhilePending = true, + .descriptorBindingPartiallyBound = true, + .descriptorBindingVariableDescriptorCount = true, + .runtimeDescriptorArray = true, + .samplerFilterMinmax = false, + .scalarBlockLayout = true, + .imagelessFramebuffer = true, + .uniformBufferStandardLayout = true, + .shaderSubgroupExtendedTypes = true, + .separateDepthStencilLayouts = true, + .hostQueryReset = true, + .timelineSemaphore = true, + .bufferDeviceAddress = true, + .bufferDeviceAddressCaptureReplay = false, + .bufferDeviceAddressMultiDevice = false, + .vulkanMemoryModel = true, + .vulkanMemoryModelDeviceScope = true, + .vulkanMemoryModelAvailabilityVisibilityChains = false, + .shaderOutputViewportIndex = true, + .shaderOutputLayer = true, + .subgroupBroadcastDynamicId = true, + + /* Vulkan 1.3 */ + .robustImageAccess = true, + .inlineUniformBlock = true, + .descriptorBindingInlineUniformBlockUpdateAfterBind = true, + .pipelineCreationCacheControl = true, + .privateData = true, + .shaderDemoteToHelperInvocation = true, + .shaderTerminateInvocation = true, + .subgroupSizeControl = true, + .computeFullSubgroups = true, + .synchronization2 = true, + .shaderZeroInitializeWorkgroupMemory = true, + .dynamicRendering = true, + .shaderIntegerDotProduct = true, + .maintenance4 = true, + + /* VK_KHR_dynamic_rendering_local_read */ + .dynamicRenderingLocalRead = true, + + /* VK_KHR_fragment_shader_barycentric */ + .fragmentShaderBarycentric = false, + + /* VK_KHR_global_priority */ + .globalPriorityQuery = true, + + /* VK_KHR_index_type_uint8 */ + .indexTypeUint8 = true, + + /* VK_KHR_line_rasterization */ + .rectangularLines = false, + .bresenhamLines = true, + .smoothLines = false, + .stippledRectangularLines = false, + .stippledBresenhamLines = false, + .stippledSmoothLines = false, + + /* VK_KHR_maintenance5 */ + .maintenance5 = true, + + /* VK_KHR_maintenance6 */ + .maintenance6 = true, + + /* VK_KHR_pipeline_executable_properties */ + .pipelineExecutableInfo = true, + + /* VK_KHR_present_id */ + .presentId = false, + + /* VK_KHR_present_wait */ + .presentWait = false, + + /* VK_KHR_shader_clock */ + .shaderSubgroupClock = false, + .shaderDeviceClock = false, + + /* VK_KHR_shader_expect_assume */ + .shaderExpectAssume = true, + + /* VK_KHR_shader_float_controls2 */ + .shaderFloatControls2 = true, + + /* VK_KHR_shader_maximal_reconvergence */ + .shaderMaximalReconvergence = true, + + /* VK_KHR_shader_subgroup_rotate */ + .shaderSubgroupRotate = true, + .shaderSubgroupRotateClustered = true, + + /* 
VK_KHR_vertex_attribute_divisor */ + .vertexAttributeInstanceRateDivisor = true, + .vertexAttributeInstanceRateZeroDivisor = true, + + /* VK_KHR_workgroup_memory_explicit_layout */ + .workgroupMemoryExplicitLayout = true, + .workgroupMemoryExplicitLayoutScalarBlockLayout = true, + .workgroupMemoryExplicitLayout8BitAccess = true, + .workgroupMemoryExplicitLayout16BitAccess = true, + + /* VK_EXT_4444_formats */ + .formatA4R4G4B4 = true, + .formatA4B4G4R4 = true, + + /* VK_EXT_attachment_feedback_loop_layout */ + .attachmentFeedbackLoopLayout = true, + + /* VK_EXT_border_color_swizzle */ + .borderColorSwizzle = true, + .borderColorSwizzleFromImage = false, + + /* VK_EXT_buffer_device_address */ + .bufferDeviceAddressCaptureReplayEXT = false, + + /* VK_EXT_color_write_enable */ + .colorWriteEnable = true, + + /* VK_EXT_conditional_rendering */ + .conditionalRendering = false, + .inheritedConditionalRendering = false, + + /* VK_EXT_custom_border_color */ + .customBorderColors = true, + .customBorderColorWithoutFormat = true, + + /* VK_EXT_depth_bias_control */ + .depthBiasControl = false, + .leastRepresentableValueForceUnormRepresentation = false, + .floatRepresentation = false, + .depthBiasExact = false, + + /* VK_EXT_depth_clip_control */ + .depthClipControl = false, + + /* VK_EXT_depth_clip_enable */ + .depthClipEnable = true, + + /* VK_EXT_dynamic_rendering_unused_attachments */ + .dynamicRenderingUnusedAttachments = true, + + /* VK_EXT_extended_dynamic_state */ + .extendedDynamicState = true, + + /* VK_EXT_extended_dynamic_state2 */ + .extendedDynamicState2 = true, + .extendedDynamicState2LogicOp = true, + .extendedDynamicState2PatchControlPoints = false, + + /* VK_EXT_extended_dynamic_state3 */ + .extendedDynamicState3TessellationDomainOrigin = false, + .extendedDynamicState3DepthClampEnable = true, + .extendedDynamicState3PolygonMode = true, + .extendedDynamicState3RasterizationSamples = true, + .extendedDynamicState3SampleMask = true, + .extendedDynamicState3AlphaToCoverageEnable = true, + .extendedDynamicState3AlphaToOneEnable = true, + .extendedDynamicState3LogicOpEnable = true, + .extendedDynamicState3ColorBlendEnable = true, + .extendedDynamicState3ColorBlendEquation = true, + .extendedDynamicState3ColorWriteMask = true, + .extendedDynamicState3RasterizationStream = false, + .extendedDynamicState3ConservativeRasterizationMode = false, + .extendedDynamicState3ExtraPrimitiveOverestimationSize = false, + .extendedDynamicState3DepthClipEnable = true, + .extendedDynamicState3SampleLocationsEnable = false, + .extendedDynamicState3ColorBlendAdvanced = false, + .extendedDynamicState3ProvokingVertexMode = true, + .extendedDynamicState3LineRasterizationMode = true, + .extendedDynamicState3LineStippleEnable = false, + .extendedDynamicState3DepthClipNegativeOneToOne = false, + .extendedDynamicState3ViewportWScalingEnable = false, + .extendedDynamicState3ViewportSwizzle = false, + .extendedDynamicState3CoverageToColorEnable = false, + .extendedDynamicState3CoverageToColorLocation = false, + .extendedDynamicState3CoverageModulationMode = false, + .extendedDynamicState3CoverageModulationTableEnable = false, + .extendedDynamicState3CoverageModulationTable = false, + .extendedDynamicState3CoverageReductionMode = false, + .extendedDynamicState3RepresentativeFragmentTestEnable = false, + .extendedDynamicState3ShadingRateImageEnable = false, + + /* VK_EXT_graphics_pipeline_library */ + .graphicsPipelineLibrary = true, + + /* VK_EXT_host_image_copy */ + .hostImageCopy = true, + + /* 
VK_EXT_image_2d_view_of_3d */ + .image2DViewOf3D = true, + .sampler2DViewOf3D = true, + + /* VK_EXT_image_sliced_view_of_3d */ + .imageSlicedViewOf3D = false, + +#ifdef HK_USE_WSI_PLATFORM + /* VK_EXT_swapchain_maintenance1 */ + .swapchainMaintenance1 = false, +#endif + + /* VK_EXT_image_view_min_lod */ + .minLod = false, + + /* VK_EXT_map_memory_placed */ + .memoryMapPlaced = false, + .memoryMapRangePlaced = false, + .memoryUnmapReserve = false, + + /* VK_EXT_multi_draw */ + .multiDraw = true, + + /* VK_EXT_mutable_descriptor_type */ + .mutableDescriptorType = true, + + /* VK_EXT_non_seamless_cube_map */ + .nonSeamlessCubeMap = true, + + /* VK_EXT_pipeline_protected_access */ + .pipelineProtectedAccess = true, + + /* VK_EXT_pipeline_robustness */ + .pipelineRobustness = true, + + /* VK_EXT_primitive_topology_list_restart */ + .primitiveTopologyListRestart = true, + .primitiveTopologyPatchListRestart = false, + + /* VK_EXT_primitives_generated_query */ + .primitivesGeneratedQuery = false, + .primitivesGeneratedQueryWithNonZeroStreams = false, + .primitivesGeneratedQueryWithRasterizerDiscard = false, + + /* VK_EXT_provoking_vertex */ + .provokingVertexLast = true, + .transformFeedbackPreservesProvokingVertex = true, + + /* VK_EXT_robustness2 */ + .robustBufferAccess2 = true, + .robustImageAccess2 = true, + .nullDescriptor = true, + + /* VK_EXT_shader_image_atomic_int64 */ + .shaderImageInt64Atomics = false, + .sparseImageInt64Atomics = false, + + /* VK_EXT_shader_module_identifier */ + .shaderModuleIdentifier = true, + + /* VK_EXT_shader_object */ + .shaderObject = true, + + /* VK_EXT_shader_replicated_composites */ + .shaderReplicatedComposites = true, + + /* VK_KHR_shader_subgroup_uniform_control_flow */ + .shaderSubgroupUniformControlFlow = true, + + /* VK_EXT_texel_buffer_alignment */ + .texelBufferAlignment = true, + + /* VK_EXT_transform_feedback */ + .transformFeedback = true, + .geometryStreams = true, + + /* VK_EXT_vertex_input_dynamic_state */ + .vertexInputDynamicState = true, + + /* VK_EXT_ycbcr_2plane_444_formats */ + .ycbcr2plane444Formats = false, + + /* VK_EXT_ycbcr_image_arrays */ + .ycbcrImageArrays = false, + }; +} + +static void +hk_get_device_properties(const struct agx_device *dev, + const struct hk_instance *instance, + struct vk_properties *properties) +{ + const VkSampleCountFlagBits sample_counts = + VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT; + + uint64_t os_page_size = 16384; + os_get_page_size(&os_page_size); + + *properties = (struct vk_properties){ + .apiVersion = hk_get_vk_version(), + .driverVersion = vk_get_driver_version(), + .vendorID = instance->force_vk_vendor ?: VK_VENDOR_ID_MESA, + .deviceID = 0, + .deviceType = VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU, + + /* Vulkan 1.0 limits */ + .maxImageDimension1D = 16384, + .maxImageDimension2D = 16384, + .maxImageDimension3D = 16384, + .maxImageDimensionCube = 16384, + .maxImageArrayLayers = 2048, + .maxTexelBufferElements = AGX_TEXTURE_BUFFER_MAX_SIZE, + .maxUniformBufferRange = 65536, + .maxStorageBufferRange = UINT32_MAX, + .maxPushConstantsSize = HK_MAX_PUSH_SIZE, + .maxMemoryAllocationCount = 4096, + .maxSamplerAllocationCount = 4000, + .bufferImageGranularity = 0x400, + .sparseAddressSpaceSize = HK_SPARSE_ADDR_SPACE_SIZE, + .maxBoundDescriptorSets = HK_MAX_SETS, + .maxPerStageDescriptorSamplers = HK_MAX_DESCRIPTORS, + .maxPerStageDescriptorUniformBuffers = HK_MAX_DESCRIPTORS, + .maxPerStageDescriptorStorageBuffers = HK_MAX_DESCRIPTORS, + .maxPerStageDescriptorSampledImages = 
HK_MAX_DESCRIPTORS, + .maxPerStageDescriptorStorageImages = HK_MAX_DESCRIPTORS, + .maxPerStageDescriptorInputAttachments = HK_MAX_DESCRIPTORS, + .maxPerStageResources = UINT32_MAX, + .maxDescriptorSetSamplers = HK_MAX_DESCRIPTORS, + .maxDescriptorSetUniformBuffers = HK_MAX_DESCRIPTORS, + .maxDescriptorSetUniformBuffersDynamic = HK_MAX_DYNAMIC_BUFFERS / 2, + .maxDescriptorSetStorageBuffers = HK_MAX_DESCRIPTORS, + .maxDescriptorSetStorageBuffersDynamic = HK_MAX_DYNAMIC_BUFFERS / 2, + .maxDescriptorSetSampledImages = HK_MAX_DESCRIPTORS, + .maxDescriptorSetStorageImages = HK_MAX_DESCRIPTORS, + .maxDescriptorSetInputAttachments = HK_MAX_DESCRIPTORS, + .maxVertexInputAttributes = AGX_MAX_VBUFS, + .maxVertexInputBindings = AGX_MAX_ATTRIBS, + .maxVertexInputAttributeOffset = 65535, + .maxVertexInputBindingStride = 2048, + .maxVertexOutputComponents = 64, + .maxGeometryShaderInvocations = 32, + .maxGeometryInputComponents = 128, + .maxGeometryOutputComponents = 128, + .maxGeometryOutputVertices = 1024, + .maxGeometryTotalOutputComponents = 1024, + .maxTessellationGenerationLevel = 64, + .maxTessellationPatchSize = 32, + .maxTessellationControlPerVertexInputComponents = 128, + .maxTessellationControlPerVertexOutputComponents = 128, + .maxTessellationControlPerPatchOutputComponents = 120, + .maxTessellationControlTotalOutputComponents = 4216, + .maxTessellationEvaluationInputComponents = 128, + .maxTessellationEvaluationOutputComponents = 128, + .maxFragmentInputComponents = 64, + .maxFragmentOutputAttachments = HK_MAX_RTS, + .maxFragmentDualSrcAttachments = 1, + .maxFragmentCombinedOutputResources = 16, + .maxComputeSharedMemorySize = HK_MAX_SHARED_SIZE, + .maxComputeWorkGroupCount = {0x7fffffff, 65535, 65535}, + .maxComputeWorkGroupInvocations = 1024, + .maxComputeWorkGroupSize = {1024, 1024, 64}, + .subPixelPrecisionBits = 8, + .subTexelPrecisionBits = 8, + .mipmapPrecisionBits = 8, + .maxDrawIndexedIndexValue = UINT32_MAX, + .maxDrawIndirectCount = UINT32_MAX, + .maxSamplerLodBias = 15, + .maxSamplerAnisotropy = 16, + .maxViewports = HK_MAX_VIEWPORTS, + .maxViewportDimensions = {32768, 32768}, + .viewportBoundsRange = {-65536, 65536}, + .viewportSubPixelBits = 8, + .minMemoryMapAlignment = os_page_size, + .minTexelBufferOffsetAlignment = HK_MIN_TEXEL_BUFFER_ALIGNMENT, + .minUniformBufferOffsetAlignment = HK_MIN_UBO_ALIGNMENT, + .minStorageBufferOffsetAlignment = HK_MIN_SSBO_ALIGNMENT, + .minTexelOffset = -8, + .maxTexelOffset = 7, + .minTexelGatherOffset = -8, + .maxTexelGatherOffset = 7, + .minInterpolationOffset = -0.5, + .maxInterpolationOffset = 0.4375, + .subPixelInterpolationOffsetBits = 4, + .maxFramebufferHeight = 16384, + .maxFramebufferWidth = 16384, + .maxFramebufferLayers = 2048, + .framebufferColorSampleCounts = sample_counts, + .framebufferDepthSampleCounts = sample_counts, + .framebufferNoAttachmentsSampleCounts = sample_counts, + .framebufferStencilSampleCounts = sample_counts, + .maxColorAttachments = HK_MAX_RTS, + .sampledImageColorSampleCounts = sample_counts, + .sampledImageIntegerSampleCounts = sample_counts, + .sampledImageDepthSampleCounts = sample_counts, + .sampledImageStencilSampleCounts = sample_counts, + .storageImageSampleCounts = sample_counts, + .maxSampleMaskWords = 1, + .timestampComputeAndGraphics = false, + .timestampPeriod = 1, + .maxClipDistances = 8, + .maxCullDistances = 8, + .maxCombinedClipAndCullDistances = 8, + .discreteQueuePriorities = 2, + .pointSizeRange = {1.0, 512.f - 0.0625f}, + .lineWidthRange = {1.0, 16.0f}, + .pointSizeGranularity = 0.0625, + 
.lineWidthGranularity = 1.0f / 16.0f, + .strictLines = false, + .standardSampleLocations = true, + .optimalBufferCopyOffsetAlignment = 1, + .optimalBufferCopyRowPitchAlignment = 1, + .nonCoherentAtomSize = 64, + + /* Vulkan 1.0 sparse properties */ + .sparseResidencyNonResidentStrict = false, + .sparseResidencyAlignedMipSize = false, + .sparseResidencyStandard2DBlockShape = false, + .sparseResidencyStandard2DMultisampleBlockShape = false, + .sparseResidencyStandard3DBlockShape = false, + + /* Vulkan 1.1 properties */ + .subgroupSize = 32, + .subgroupSupportedStages = VK_SHADER_STAGE_COMPUTE_BIT | + VK_SHADER_STAGE_FRAGMENT_BIT | + VK_SHADER_STAGE_VERTEX_BIT, + .subgroupSupportedOperations = + VK_SUBGROUP_FEATURE_BASIC_BIT | VK_SUBGROUP_FEATURE_BALLOT_BIT | + VK_SUBGROUP_FEATURE_VOTE_BIT | VK_SUBGROUP_FEATURE_QUAD_BIT | + VK_SUBGROUP_FEATURE_SHUFFLE_BIT | + VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT | + VK_SUBGROUP_FEATURE_ROTATE_BIT_KHR | + VK_SUBGROUP_FEATURE_ARITHMETIC_BIT | + VK_SUBGROUP_FEATURE_CLUSTERED_BIT | + VK_SUBGROUP_FEATURE_ROTATE_CLUSTERED_BIT_KHR, + .subgroupQuadOperationsInAllStages = true, + .pointClippingBehavior = VK_POINT_CLIPPING_BEHAVIOR_USER_CLIP_PLANES_ONLY, + .maxMultiviewViewCount = HK_MAX_MULTIVIEW_VIEW_COUNT, + .maxMultiviewInstanceIndex = UINT32_MAX, + .maxPerSetDescriptors = UINT32_MAX, + .maxMemoryAllocationSize = (1u << 31), + + /* Vulkan 1.2 properties */ + .supportedDepthResolveModes = + VK_RESOLVE_MODE_SAMPLE_ZERO_BIT | VK_RESOLVE_MODE_AVERAGE_BIT | + VK_RESOLVE_MODE_MIN_BIT | VK_RESOLVE_MODE_MAX_BIT, + .supportedStencilResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT | + VK_RESOLVE_MODE_MIN_BIT | + VK_RESOLVE_MODE_MAX_BIT, + .independentResolveNone = true, + .independentResolve = true, + .driverID = VK_DRIVER_ID_MESA_HONEYKRISP, + .conformanceVersion = (VkConformanceVersion){1, 3, 8, 3}, + .denormBehaviorIndependence = VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL, + .roundingModeIndependence = VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL, + .shaderSignedZeroInfNanPreserveFloat16 = true, + .shaderSignedZeroInfNanPreserveFloat32 = true, + .shaderSignedZeroInfNanPreserveFloat64 = false, + .shaderDenormPreserveFloat16 = true, + .shaderDenormPreserveFloat32 = false, + .shaderDenormPreserveFloat64 = false, + .shaderDenormFlushToZeroFloat16 = false, + .shaderDenormFlushToZeroFloat32 = true, + .shaderDenormFlushToZeroFloat64 = false, + .shaderRoundingModeRTEFloat16 = true, + .shaderRoundingModeRTEFloat32 = true, + .shaderRoundingModeRTEFloat64 = false, + .shaderRoundingModeRTZFloat16 = false, + .shaderRoundingModeRTZFloat32 = false, + .shaderRoundingModeRTZFloat64 = false, + .maxUpdateAfterBindDescriptorsInAllPools = UINT32_MAX, + .shaderUniformBufferArrayNonUniformIndexingNative = true, + .shaderSampledImageArrayNonUniformIndexingNative = true, + .shaderStorageBufferArrayNonUniformIndexingNative = true, + .shaderStorageImageArrayNonUniformIndexingNative = true, + .shaderInputAttachmentArrayNonUniformIndexingNative = true, + .robustBufferAccessUpdateAfterBind = true, + .quadDivergentImplicitLod = false, + .maxPerStageDescriptorUpdateAfterBindSamplers = HK_MAX_DESCRIPTORS, + .maxPerStageDescriptorUpdateAfterBindUniformBuffers = HK_MAX_DESCRIPTORS, + .maxPerStageDescriptorUpdateAfterBindStorageBuffers = HK_MAX_DESCRIPTORS, + .maxPerStageDescriptorUpdateAfterBindSampledImages = HK_MAX_DESCRIPTORS, + .maxPerStageDescriptorUpdateAfterBindStorageImages = HK_MAX_DESCRIPTORS, + .maxPerStageDescriptorUpdateAfterBindInputAttachments = + HK_MAX_DESCRIPTORS, + 
.maxPerStageUpdateAfterBindResources = UINT32_MAX, + .maxDescriptorSetUpdateAfterBindSamplers = HK_MAX_DESCRIPTORS, + .maxDescriptorSetUpdateAfterBindUniformBuffers = HK_MAX_DESCRIPTORS, + .maxDescriptorSetUpdateAfterBindUniformBuffersDynamic = + HK_MAX_DYNAMIC_BUFFERS / 2, + .maxDescriptorSetUpdateAfterBindStorageBuffers = HK_MAX_DESCRIPTORS, + .maxDescriptorSetUpdateAfterBindStorageBuffersDynamic = + HK_MAX_DYNAMIC_BUFFERS / 2, + .maxDescriptorSetUpdateAfterBindSampledImages = HK_MAX_DESCRIPTORS, + .maxDescriptorSetUpdateAfterBindStorageImages = HK_MAX_DESCRIPTORS, + .maxDescriptorSetUpdateAfterBindInputAttachments = HK_MAX_DESCRIPTORS, + .filterMinmaxSingleComponentFormats = false, + .filterMinmaxImageComponentMapping = false, + .maxTimelineSemaphoreValueDifference = UINT64_MAX, + .framebufferIntegerColorSampleCounts = sample_counts, + + /* Vulkan 1.3 properties */ + .minSubgroupSize = 32, + .maxSubgroupSize = 32, + .maxComputeWorkgroupSubgroups = 1024 / 32, + .requiredSubgroupSizeStages = 0, + .maxInlineUniformBlockSize = 1 << 16, + .maxPerStageDescriptorInlineUniformBlocks = 32, + .maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks = 32, + .maxDescriptorSetInlineUniformBlocks = 6 * 32, + .maxDescriptorSetUpdateAfterBindInlineUniformBlocks = 6 * 32, + .maxInlineUniformTotalSize = 1 << 16, + .integerDotProduct4x8BitPackedUnsignedAccelerated = false, + .integerDotProduct4x8BitPackedSignedAccelerated = false, + .integerDotProduct4x8BitPackedMixedSignednessAccelerated = false, + .storageTexelBufferOffsetAlignmentBytes = HK_MIN_TEXEL_BUFFER_ALIGNMENT, + .storageTexelBufferOffsetSingleTexelAlignment = true, + .uniformTexelBufferOffsetAlignmentBytes = HK_MIN_TEXEL_BUFFER_ALIGNMENT, + .uniformTexelBufferOffsetSingleTexelAlignment = true, + .maxBufferSize = HK_MAX_BUFFER_SIZE, + + /* VK_KHR_push_descriptor */ + .maxPushDescriptors = HK_MAX_PUSH_DESCRIPTORS, + + /* VK_EXT_custom_border_color */ + .maxCustomBorderColorSamplers = 4000, + + /* VK_EXT_extended_dynamic_state3 */ + .dynamicPrimitiveTopologyUnrestricted = true, + + /* VK_EXT_graphics_pipeline_library */ + .graphicsPipelineLibraryFastLinking = true, + .graphicsPipelineLibraryIndependentInterpolationDecoration = true, + + /* VK_EXT_host_image_copy */ + + /* VK_KHR_line_rasterization */ + .lineSubPixelPrecisionBits = 8, + + /* VK_KHR_maintenance5 */ + .earlyFragmentMultisampleCoverageAfterSampleCounting = false, + .earlyFragmentSampleMaskTestBeforeSampleCounting = true, + .depthStencilSwizzleOneSupport = true, + .polygonModePointSize = false, + .nonStrictSinglePixelWideLinesUseParallelogram = false, + .nonStrictWideLinesUseParallelogram = false, + + /* VK_KHR_maintenance6 */ + .blockTexelViewCompatibleMultipleLayers = false, + .maxCombinedImageSamplerDescriptorCount = 3, + .fragmentShadingRateClampCombinerInputs = false, /* TODO */ + + /* VK_EXT_map_memory_placed */ + .minPlacedMemoryMapAlignment = os_page_size, + + /* VK_EXT_multi_draw */ + .maxMultiDrawCount = UINT16_MAX, + + /* VK_EXT_pipeline_robustness */ + .defaultRobustnessStorageBuffers = + VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT, + .defaultRobustnessUniformBuffers = + VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT, + .defaultRobustnessVertexInputs = + VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT, + .defaultRobustnessImages = + VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_ROBUST_IMAGE_ACCESS_2_EXT, + + /* VK_EXT_physical_device_drm gets populated later */ + + /* VK_EXT_provoking_vertex */ + .provokingVertexModePerPipeline = true, + 
.transformFeedbackPreservesTriangleFanProvokingVertex = true, + + /* VK_EXT_robustness2 */ + .robustStorageBufferAccessSizeAlignment = HK_SSBO_BOUNDS_CHECK_ALIGNMENT, + .robustUniformBufferAccessSizeAlignment = HK_MIN_UBO_ALIGNMENT, + + /* VK_EXT_sample_locations */ + .sampleLocationSampleCounts = sample_counts, + .maxSampleLocationGridSize = (VkExtent2D){1, 1}, + .sampleLocationCoordinateRange[0] = 0.0f, + .sampleLocationCoordinateRange[1] = 0.9375f, + .sampleLocationSubPixelBits = 4, + .variableSampleLocations = false, + + /* VK_EXT_shader_object */ + .shaderBinaryVersion = 0, + + /* VK_EXT_transform_feedback */ + .maxTransformFeedbackStreams = 4, + .maxTransformFeedbackBuffers = 4, + .maxTransformFeedbackBufferSize = UINT32_MAX, + .maxTransformFeedbackStreamDataSize = 2048, + .maxTransformFeedbackBufferDataSize = 512, + .maxTransformFeedbackBufferDataStride = 2048, + .transformFeedbackQueries = true, + .transformFeedbackStreamsLinesTriangles = false, + .transformFeedbackRasterizationStreamSelect = false, + .transformFeedbackDraw = false, + + /* VK_KHR_vertex_attribute_divisor */ + .maxVertexAttribDivisor = UINT32_MAX, + .supportsNonZeroFirstInstance = true, + + /* VK_KHR_fragment_shader_barycentric */ + .triStripVertexOrderIndependentOfProvokingVertex = false, + }; + + strncpy(properties->deviceName, dev->name, sizeof(properties->deviceName)); + + /* VK_EXT_shader_module_identifier */ + static_assert(sizeof(vk_shaderModuleIdentifierAlgorithmUUID) == + sizeof(properties->shaderModuleIdentifierAlgorithmUUID)); + memcpy(properties->shaderModuleIdentifierAlgorithmUUID, + vk_shaderModuleIdentifierAlgorithmUUID, + sizeof(properties->shaderModuleIdentifierAlgorithmUUID)); + + const struct { + uint16_t vendor_id; + uint16_t device_id; + uint8_t pad[12]; + } dev_uuid = { + .vendor_id = 0, + .device_id = 0, + }; + static_assert(sizeof(dev_uuid) == VK_UUID_SIZE); + memcpy(properties->deviceUUID, &dev_uuid, VK_UUID_SIZE); + static_assert(sizeof(instance->driver_build_sha) >= VK_UUID_SIZE); + memcpy(properties->driverUUID, instance->driver_build_sha, VK_UUID_SIZE); + + strncpy(properties->driverName, "Honeykrisp", VK_MAX_DRIVER_NAME_SIZE); + snprintf(properties->driverInfo, VK_MAX_DRIVER_INFO_SIZE, + "Mesa " PACKAGE_VERSION MESA_GIT_SHA1); + + /* We don't use the layouts ATM so just report all layouts from + * extensions that we support as compatible. 
+ */ + static const VkImageLayout supported_layouts[] = { + VK_IMAGE_LAYOUT_GENERAL, /* required by spec */ + VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL, + VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL, + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + VK_IMAGE_LAYOUT_PREINITIALIZED, + VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL, + VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_STENCIL_READ_ONLY_OPTIMAL, + VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_OPTIMAL, + VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_OPTIMAL, + VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL, + VK_IMAGE_LAYOUT_STENCIL_READ_ONLY_OPTIMAL, + VK_IMAGE_LAYOUT_READ_ONLY_OPTIMAL, + VK_IMAGE_LAYOUT_ATTACHMENT_OPTIMAL, + // VK_IMAGE_LAYOUT_FRAGMENT_DENSITY_MAP_OPTIMAL_EXT, + VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT, + }; + + properties->pCopySrcLayouts = (VkImageLayout *)supported_layouts; + properties->copySrcLayoutCount = ARRAY_SIZE(supported_layouts); + properties->pCopyDstLayouts = (VkImageLayout *)supported_layouts; + properties->copyDstLayoutCount = ARRAY_SIZE(supported_layouts); + + /* We're a UMA device so we can always map every kind of memory */ + properties->identicalMemoryTypeRequirements = true; + + { + struct mesa_sha1 sha1_ctx; + uint8_t sha1[20]; + + _mesa_sha1_init(&sha1_ctx); + /* Make sure we don't match with other vendors */ + const char *driver = "honeykrisp-v1"; + _mesa_sha1_update(&sha1_ctx, driver, strlen(driver)); + _mesa_sha1_final(&sha1_ctx, sha1); + + memcpy(properties->optimalTilingLayoutUUID, sha1, VK_UUID_SIZE); + } +} + +static void +hk_physical_device_init_pipeline_cache(struct hk_physical_device *pdev) +{ + struct hk_instance *instance = hk_physical_device_instance(pdev); + + struct mesa_sha1 sha_ctx; + _mesa_sha1_init(&sha_ctx); + + _mesa_sha1_update(&sha_ctx, instance->driver_build_sha, + sizeof(instance->driver_build_sha)); + + const uint64_t compiler_flags = hk_physical_device_compiler_flags(pdev); + _mesa_sha1_update(&sha_ctx, &compiler_flags, sizeof(compiler_flags)); + + unsigned char sha[SHA1_DIGEST_LENGTH]; + _mesa_sha1_final(&sha_ctx, sha); + + static_assert(SHA1_DIGEST_LENGTH >= VK_UUID_SIZE); + memcpy(pdev->vk.properties.pipelineCacheUUID, sha, VK_UUID_SIZE); + memcpy(pdev->vk.properties.shaderBinaryUUID, sha, VK_UUID_SIZE); + +#ifdef ENABLE_SHADER_CACHE + char renderer[10]; + ASSERTED int len = snprintf(renderer, sizeof(renderer), "hk_g13g_"); + assert(len == sizeof(renderer) - 2); + + char timestamp[41]; + _mesa_sha1_format(timestamp, instance->driver_build_sha); + + const uint64_t driver_flags = hk_physical_device_compiler_flags(pdev); + pdev->vk.disk_cache = disk_cache_create(renderer, timestamp, driver_flags); +#endif +} + +static void +hk_physical_device_free_disk_cache(struct hk_physical_device *pdev) +{ +#ifdef ENABLE_SHADER_CACHE + if (pdev->vk.disk_cache) { + disk_cache_destroy(pdev->vk.disk_cache); + pdev->vk.disk_cache = NULL; + } +#else + assert(pdev->vk.disk_cache == NULL); +#endif +} + +static uint64_t +hk_get_sysmem_heap_size(void) +{ + uint64_t sysmem_size_B = 0; + if (!os_get_total_physical_memory(&sysmem_size_B)) + return 0; + + /* Use 3/4 of total size to avoid swapping */ + return ROUND_DOWN_TO(sysmem_size_B * 3 / 4, 1 << 20); +} + +static uint64_t +hk_get_sysmem_heap_available(struct hk_physical_device *pdev) +{ + uint64_t sysmem_size_B = 0; + if (!os_get_available_system_memory(&sysmem_size_B)) { + vk_loge(VK_LOG_OBJS(pdev), "Failed to query available system 
memory"); + return 0; + } + + /* Use 3/4 of available to avoid swapping */ + return ROUND_DOWN_TO(sysmem_size_B * 3 / 4, 1 << 20); +} + +VkResult +hk_create_drm_physical_device(struct vk_instance *_instance, + drmDevicePtr drm_device, + struct vk_physical_device **pdev_out) +{ + struct hk_instance *instance = (struct hk_instance *)_instance; + VkResult result; + + /* Blanket refusal to probe due to unstable UAPI. */ + return VK_ERROR_INCOMPATIBLE_DRIVER; + + if (!(drm_device->available_nodes & (1 << DRM_NODE_RENDER)) || + drm_device->bustype != DRM_BUS_PLATFORM) + return VK_ERROR_INCOMPATIBLE_DRIVER; + + const char *path = drm_device->nodes[DRM_NODE_RENDER]; + int fd = open(path, O_RDWR | O_CLOEXEC); + if (fd < 0) { + return vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, + "failed to open device %s", path); + } + + drmVersionPtr version = drmGetVersion(fd); + if (!version) { + result = + vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, + "failed to query kernel driver version for device %s", path); + goto fail_fd; + } + + bool is_asahi = (strcmp(version->name, "asahi") == 0); + is_asahi |= strcmp(version->name, "virtio_gpu") == 0; + drmFreeVersion(version); + + if (!is_asahi) { + result = + vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, + "device %s does not use the asahi kernel driver", path); + goto fail_fd; + } + + struct stat st; + if (stat(drm_device->nodes[DRM_NODE_RENDER], &st)) { + result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, + "fstat() failed on %s: %m", + drm_device->nodes[DRM_NODE_RENDER]); + goto fail_fd; + } + const dev_t render_dev = st.st_rdev; + + struct hk_physical_device *pdev = + vk_zalloc(&instance->vk.alloc, sizeof(*pdev), 8, + VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + + if (pdev == NULL) { + result = vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail_fd; + } + + /* TODO: we're render-only, should we be reporting displays anyway in + * KHR_display? 
+ */ + pdev->master_fd = -1; + +#if 0 + if (instance->vk.enabled_extensions.KHR_display) { + int master_fd = + open(drm_device->nodes[DRM_NODE_PRIMARY], O_RDWR | O_CLOEXEC); + + if (master_fd >= 0) { + struct stat st; + if (!stat(drm_device->nodes[DRM_NODE_PRIMARY], &st)) { + pdev->master_fd = master_fd; + properties.drmHasPrimary = true; + properties.drmPrimaryMajor = major(st.st_rdev); + properties.drmPrimaryMinor = minor(st.st_rdev); + } + } + } +#endif + + pdev->render_dev = render_dev; + pdev->dev.fd = fd; + + if (!agx_open_device(NULL, &pdev->dev)) { + result = vk_error(instance, VK_ERROR_UNKNOWN); + goto fail_pdev_alloc; + } + + struct vk_physical_device_dispatch_table dispatch_table; + vk_physical_device_dispatch_table_from_entrypoints( + &dispatch_table, &hk_physical_device_entrypoints, true); + vk_physical_device_dispatch_table_from_entrypoints( + &dispatch_table, &wsi_physical_device_entrypoints, false); + + struct vk_device_extension_table supported_extensions; + hk_get_device_extensions(instance, &supported_extensions); + + struct vk_features supported_features; + hk_get_device_features(&supported_extensions, &supported_features); + + struct vk_properties properties; + hk_get_device_properties(&pdev->dev, instance, &properties); + + properties.drmHasRender = true; + properties.drmRenderMajor = major(render_dev); + properties.drmRenderMinor = minor(render_dev); + + result = vk_physical_device_init(&pdev->vk, &instance->vk, + &supported_extensions, &supported_features, + &properties, &dispatch_table); + if (result != VK_SUCCESS) + goto fail_agx_device; + + hk_physical_device_init_pipeline_cache(pdev); + + uint64_t sysmem_size_B = hk_get_sysmem_heap_size(); + if (sysmem_size_B == 0) { + result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, + "Failed to query total system memory"); + goto fail_disk_cache; + } + + uint32_t sysmem_heap_idx = pdev->mem_heap_count++; + pdev->mem_heaps[sysmem_heap_idx] = (struct hk_memory_heap){ + .size = sysmem_size_B, + .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, + .available = hk_get_sysmem_heap_available, + }; + + pdev->mem_types[pdev->mem_type_count++] = (VkMemoryType){ + .propertyFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | + VK_MEMORY_PROPERTY_HOST_CACHED_BIT | + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + .heapIndex = sysmem_heap_idx, + }; + + assert(pdev->mem_heap_count <= ARRAY_SIZE(pdev->mem_heaps)); + assert(pdev->mem_type_count <= ARRAY_SIZE(pdev->mem_types)); + + /* TODO: VK_QUEUE_SPARSE_BINDING_BIT*/ + pdev->queue_families[pdev->queue_family_count++] = (struct hk_queue_family){ + .queue_flags = + VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT, + + .queue_count = 1, + }; + assert(pdev->queue_family_count <= ARRAY_SIZE(pdev->queue_families)); + + unsigned st_idx = 0; + pdev->syncobj_sync_type = vk_drm_syncobj_get_type(fd); + pdev->sync_types[st_idx++] = &pdev->syncobj_sync_type; + pdev->sync_types[st_idx++] = NULL; + assert(st_idx <= ARRAY_SIZE(pdev->sync_types)); + pdev->vk.supported_sync_types = pdev->sync_types; + + result = hk_init_wsi(pdev); + if (result != VK_SUCCESS) + goto fail_disk_cache; + + *pdev_out = &pdev->vk; + + return VK_SUCCESS; + +fail_disk_cache: + hk_physical_device_free_disk_cache(pdev); + vk_physical_device_finish(&pdev->vk); +fail_agx_device: + agx_close_device(&pdev->dev); +fail_pdev_alloc: + if (pdev->master_fd) + close(pdev->master_fd); + + vk_free(&pdev->vk.instance->alloc, pdev); +fail_fd: + close(fd); + return result; +} + +void 
+hk_physical_device_destroy(struct vk_physical_device *vk_pdev) +{ + struct hk_physical_device *pdev = + container_of(vk_pdev, struct hk_physical_device, vk); + + hk_finish_wsi(pdev); + + if (pdev->master_fd >= 0) + close(pdev->master_fd); + + hk_physical_device_free_disk_cache(pdev); + agx_close_device(&pdev->dev); + vk_physical_device_finish(&pdev->vk); + vk_free(&pdev->vk.instance->alloc, pdev); +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetPhysicalDeviceMemoryProperties2( + VkPhysicalDevice physicalDevice, + VkPhysicalDeviceMemoryProperties2 *pMemoryProperties) +{ + VK_FROM_HANDLE(hk_physical_device, pdev, physicalDevice); + + pMemoryProperties->memoryProperties.memoryHeapCount = pdev->mem_heap_count; + for (int i = 0; i < pdev->mem_heap_count; i++) { + pMemoryProperties->memoryProperties.memoryHeaps[i] = (VkMemoryHeap){ + .size = pdev->mem_heaps[i].size, + .flags = pdev->mem_heaps[i].flags, + }; + } + + pMemoryProperties->memoryProperties.memoryTypeCount = pdev->mem_type_count; + for (int i = 0; i < pdev->mem_type_count; i++) { + pMemoryProperties->memoryProperties.memoryTypes[i] = pdev->mem_types[i]; + } + + vk_foreach_struct(ext, pMemoryProperties->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT: { + VkPhysicalDeviceMemoryBudgetPropertiesEXT *p = (void *)ext; + + for (unsigned i = 0; i < pdev->mem_heap_count; i++) { + const struct hk_memory_heap *heap = &pdev->mem_heaps[i]; + uint64_t used = p_atomic_read(&heap->used); + + /* From the Vulkan 1.3.278 spec: + * + * "heapUsage is an array of VK_MAX_MEMORY_HEAPS VkDeviceSize + * values in which memory usages are returned, with one element + * for each memory heap. A heap’s usage is an estimate of how + * much memory the process is currently using in that heap." + * + * TODO: Include internal allocations? + */ + p->heapUsage[i] = used; + + uint64_t available = heap->size; + if (heap->available) + available = heap->available(pdev); + + /* From the Vulkan 1.3.278 spec: + * + * "heapBudget is an array of VK_MAX_MEMORY_HEAPS VkDeviceSize + * values in which memory budgets are returned, with one + * element for each memory heap. A heap’s budget is a rough + * estimate of how much memory the process can allocate from + * that heap before allocations may fail or cause performance + * degradation. The budget includes any currently allocated + * device memory." + * + * and + * + * "The heapBudget value must be less than or equal to + * VkMemoryHeap::size for each heap." + * + * available (queried above) is the total amount free memory + * system-wide and does not include our allocations so we need + * to add that in. + */ + uint64_t budget = MIN2(available + used, heap->size); + + /* Set the budget at 90% of available to avoid thrashing */ + p->heapBudget[i] = ROUND_DOWN_TO(budget * 9 / 10, 1 << 20); + } + + /* From the Vulkan 1.3.278 spec: + * + * "The heapBudget and heapUsage values must be zero for array + * elements greater than or equal to + * VkPhysicalDeviceMemoryProperties::memoryHeapCount. The + * heapBudget value must be non-zero for array elements less than + * VkPhysicalDeviceMemoryProperties::memoryHeapCount." 
+ */ + for (unsigned i = pdev->mem_heap_count; i < VK_MAX_MEMORY_HEAPS; i++) { + p->heapBudget[i] = 0u; + p->heapUsage[i] = 0u; + } + break; + } + default: + vk_debug_ignored_stype(ext->sType); + break; + } + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetPhysicalDeviceQueueFamilyProperties2( + VkPhysicalDevice physicalDevice, uint32_t *pQueueFamilyPropertyCount, + VkQueueFamilyProperties2 *pQueueFamilyProperties) +{ + VK_FROM_HANDLE(hk_physical_device, pdev, physicalDevice); + VK_OUTARRAY_MAKE_TYPED(VkQueueFamilyProperties2, out, pQueueFamilyProperties, + pQueueFamilyPropertyCount); + + for (uint8_t i = 0; i < pdev->queue_family_count; i++) { + const struct hk_queue_family *queue_family = &pdev->queue_families[i]; + + vk_outarray_append_typed(VkQueueFamilyProperties2, &out, p) + { + p->queueFamilyProperties.queueFlags = queue_family->queue_flags; + p->queueFamilyProperties.queueCount = queue_family->queue_count; + p->queueFamilyProperties.timestampValidBits = 0; // TODO 64; + p->queueFamilyProperties.minImageTransferGranularity = + (VkExtent3D){1, 1, 1}; + + vk_foreach_struct(ext, p->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_QUEUE_FAMILY_GLOBAL_PRIORITY_PROPERTIES_KHR: { + VkQueueFamilyGlobalPriorityPropertiesKHR *props = (void *)ext; + + /* TODO: support multiple priorities */ + props->priorityCount = 1; + props->priorities[0] = VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_EXT; + break; + } + default: + break; + } + } + } + } +} + +static const VkTimeDomainKHR hk_time_domains[] = { + VK_TIME_DOMAIN_DEVICE_KHR, + VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR, +#ifdef CLOCK_MONOTONIC_RAW + VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR, +#endif +}; + +VKAPI_ATTR VkResult VKAPI_CALL +hk_GetPhysicalDeviceCalibrateableTimeDomainsKHR(VkPhysicalDevice physicalDevice, + uint32_t *pTimeDomainCount, + VkTimeDomainKHR *pTimeDomains) +{ + VK_OUTARRAY_MAKE_TYPED(VkTimeDomainKHR, out, pTimeDomains, pTimeDomainCount); + + for (int d = 0; d < ARRAY_SIZE(hk_time_domains); d++) { + vk_outarray_append_typed(VkTimeDomainKHR, &out, i) + { + *i = hk_time_domains[d]; + } + } + + return vk_outarray_status(&out); +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetPhysicalDeviceMultisamplePropertiesEXT( + VkPhysicalDevice physicalDevice, VkSampleCountFlagBits samples, + VkMultisamplePropertiesEXT *pMultisampleProperties) +{ + VK_FROM_HANDLE(hk_physical_device, pdev, physicalDevice); + + if (samples & pdev->vk.properties.sampleLocationSampleCounts) { + pMultisampleProperties->maxSampleLocationGridSize = (VkExtent2D){1, 1}; + } else { + pMultisampleProperties->maxSampleLocationGridSize = (VkExtent2D){0, 0}; + } +} diff --git a/src/asahi/vulkan/hk_physical_device.h b/src/asahi/vulkan/hk_physical_device.h new file mode 100644 index 00000000000..8b8b318d8be --- /dev/null +++ b/src/asahi/vulkan/hk_physical_device.h @@ -0,0 +1,76 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "asahi/lib/agx_device.h" +#include +#include "hk_private.h" +#include "vk_physical_device.h" +#include "vk_sync.h" +#include "wsi_common.h" + +struct hk_instance; +struct hk_physical_device; + +struct hk_queue_family { + VkQueueFlags queue_flags; + uint32_t queue_count; +}; + +struct hk_memory_heap { + uint64_t size; + uint64_t used; + VkMemoryHeapFlags flags; + uint64_t (*available)(struct hk_physical_device *pdev); +}; + +struct hk_physical_device { + struct vk_physical_device vk; + dev_t render_dev; + int master_fd; + + /* Only used for VK_EXT_memory_budget */ + struct agx_device dev; + + struct wsi_device wsi_device; + + uint8_t device_uuid[VK_UUID_SIZE]; + + // TODO: add mapable VRAM heap if possible + struct hk_memory_heap mem_heaps[3]; + VkMemoryType mem_types[3]; + uint8_t mem_heap_count; + uint8_t mem_type_count; + + struct hk_queue_family queue_families[3]; + uint8_t queue_family_count; + + struct vk_sync_type syncobj_sync_type; + const struct vk_sync_type *sync_types[2]; +}; + +VK_DEFINE_HANDLE_CASTS(hk_physical_device, vk.base, VkPhysicalDevice, + VK_OBJECT_TYPE_PHYSICAL_DEVICE) + +static inline struct hk_instance * +hk_physical_device_instance(struct hk_physical_device *pdev) +{ + return (struct hk_instance *)pdev->vk.instance; +} + +VkResult hk_create_drm_physical_device(struct vk_instance *vk_instance, + struct _drmDevice *drm_device, + struct vk_physical_device **pdev_out); + +void hk_physical_device_destroy(struct vk_physical_device *vk_device); + +#if defined(VK_USE_PLATFORM_WAYLAND_KHR) || \ + defined(VK_USE_PLATFORM_XCB_KHR) || defined(VK_USE_PLATFORM_XLIB_KHR) || \ + defined(VK_USE_PLATFORM_DISPLAY_KHR) +#define HK_USE_WSI_PLATFORM +#endif diff --git a/src/asahi/vulkan/hk_private.h b/src/asahi/vulkan/hk_private.h new file mode 100644 index 00000000000..bd2b8d68f97 --- /dev/null +++ b/src/asahi/vulkan/hk_private.h @@ -0,0 +1,53 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include + +#include "vk_log.h" +#include "vk_util.h" + +#define HK_MAX_SETS 8 +#define HK_MAX_PUSH_SIZE 128 +#define HK_MAX_DYNAMIC_BUFFERS 64 +#define HK_MAX_RTS 8 +#define HK_MIN_SSBO_ALIGNMENT 16 +#define HK_MIN_TEXEL_BUFFER_ALIGNMENT 16 +#define HK_MIN_UBO_ALIGNMENT 64 +#define HK_MAX_VIEWPORTS 16 +#define HK_MAX_DESCRIPTOR_SIZE 32 +#define HK_MAX_PUSH_DESCRIPTORS 32 +#define HK_MAX_DESCRIPTOR_SET_SIZE (1u << 30) +#define HK_MAX_DESCRIPTORS (1 << 20) +#define HK_PUSH_DESCRIPTOR_SET_SIZE \ + (HK_MAX_PUSH_DESCRIPTORS * HK_MAX_DESCRIPTOR_SIZE) +#define HK_SSBO_BOUNDS_CHECK_ALIGNMENT 4 +#define HK_MAX_MULTIVIEW_VIEW_COUNT 32 + +#define HK_SPARSE_ADDR_SPACE_SIZE (1ull << 39) +#define HK_MAX_BUFFER_SIZE (1ull << 31) +#define HK_MAX_SHARED_SIZE (32 * 1024) + +struct hk_addr_range { + uint64_t addr; + uint64_t range; +}; + +#define perf_debug(dev, fmt, ...) 
\ + do { \ + if (dev->dev.debug & AGX_DBG_PERF) \ + mesa_log(MESA_LOG_WARN, (MESA_LOG_TAG), (fmt), ##__VA_ARGS__); \ + } while (0) + +/* Fake values, pending UAPI upstreaming */ +#ifndef DRM_FORMAT_MOD_APPLE_TWIDDLED +#define DRM_FORMAT_MOD_APPLE_TWIDDLED (2) +#endif +#ifndef DRM_FORMAT_MOD_APPLE_TWIDDLED_COMPRESSED +#define DRM_FORMAT_MOD_APPLE_TWIDDLED_COMPRESSED (3) +#endif diff --git a/src/asahi/vulkan/hk_query_pool.c b/src/asahi/vulkan/hk_query_pool.c new file mode 100644 index 00000000000..5762c69419c --- /dev/null +++ b/src/asahi/vulkan/hk_query_pool.c @@ -0,0 +1,580 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_query_pool.h" + +#include "agx_compile.h" +#include "agx_pack.h" +#include "hk_buffer.h" +#include "hk_cmd_buffer.h" +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_event.h" +#include "hk_physical_device.h" +#include "hk_shader.h" + +#include "shader_enums.h" +#include "vk_common_entrypoints.h" +#include "vk_meta.h" +#include "vk_pipeline.h" + +#include "asahi/lib/agx_bo.h" +#include "asahi/lib/libagx_shaders.h" +#include "asahi/lib/shaders/query.h" +#include "compiler/nir/nir.h" +#include "compiler/nir/nir_builder.h" + +#include "util/os_time.h" +#include "vulkan/vulkan_core.h" + +struct hk_query_report { + /* TODO: do we want this to be legit u64? */ + uint32_t value; + uint32_t padding; +}; + +static uint16_t * +hk_pool_oq_index_ptr(const struct hk_query_pool *pool) +{ + return (uint16_t *)(pool->bo->ptr.cpu + pool->query_start); +} + +static uint32_t +hk_reports_per_query(struct hk_query_pool *pool) +{ + switch (pool->vk.query_type) { + case VK_QUERY_TYPE_OCCLUSION: + case VK_QUERY_TYPE_TIMESTAMP: + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: + return 1; + case VK_QUERY_TYPE_PIPELINE_STATISTICS: + return util_bitcount(pool->vk.pipeline_statistics); + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: + // Primitives succeeded and primitives needed + return 2; + default: + unreachable("Unsupported query type"); + } +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateQueryPool(VkDevice device, const VkQueryPoolCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkQueryPool *pQueryPool) +{ + VK_FROM_HANDLE(hk_device, dev, device); + struct hk_query_pool *pool; + + bool occlusion = pCreateInfo->queryType == VK_QUERY_TYPE_OCCLUSION; + unsigned occlusion_queries = occlusion ? 
pCreateInfo->queryCount : 0; + + pool = + vk_query_pool_create(&dev->vk, pCreateInfo, pAllocator, sizeof(*pool)); + if (!pool) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + /* We place the availability first and then data */ + pool->query_start = align(pool->vk.query_count * sizeof(uint32_t), + sizeof(struct hk_query_report)); + + uint32_t reports_per_query = hk_reports_per_query(pool); + pool->query_stride = reports_per_query * sizeof(struct hk_query_report); + + if (pool->vk.query_count > 0) { + uint32_t bo_size = pool->query_start; + + /* For occlusion queries, we stick the query index remapping here */ + if (occlusion_queries) + bo_size += sizeof(uint16_t) * pool->vk.query_count; + else + bo_size += pool->query_stride * pool->vk.query_count; + + pool->bo = + agx_bo_create(&dev->dev, bo_size, AGX_BO_WRITEBACK, "Query pool"); + if (!pool->bo) { + hk_DestroyQueryPool(device, hk_query_pool_to_handle(pool), pAllocator); + return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY); + } + } + + uint16_t *oq_index = hk_pool_oq_index_ptr(pool); + + for (unsigned i = 0; i < occlusion_queries; ++i) { + uint64_t zero = 0; + unsigned index; + + VkResult result = hk_descriptor_table_add( + dev, &dev->occlusion_queries, &zero, sizeof(uint64_t), &index); + + if (result != VK_SUCCESS) { + hk_DestroyQueryPool(device, hk_query_pool_to_handle(pool), pAllocator); + return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY); + } + + /* We increment as we go so we can clean up properly if we run out */ + assert(pool->oq_queries < occlusion_queries); + oq_index[pool->oq_queries++] = index; + } + + *pQueryPool = hk_query_pool_to_handle(pool); + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +hk_DestroyQueryPool(VkDevice device, VkQueryPool queryPool, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_query_pool, pool, queryPool); + + if (!pool) + return; + + uint16_t *oq_index = hk_pool_oq_index_ptr(pool); + + for (unsigned i = 0; i < pool->oq_queries; ++i) { + hk_descriptor_table_remove(dev, &dev->occlusion_queries, oq_index[i]); + } + + agx_bo_unreference(pool->bo); + vk_query_pool_destroy(&dev->vk, pAllocator, &pool->vk); +} + +static uint64_t +hk_query_available_addr(struct hk_query_pool *pool, uint32_t query) +{ + assert(query < pool->vk.query_count); + return pool->bo->ptr.gpu + query * sizeof(uint32_t); +} + +static uint32_t * +hk_query_available_map(struct hk_query_pool *pool, uint32_t query) +{ + assert(query < pool->vk.query_count); + return (uint32_t *)pool->bo->ptr.cpu + query; +} + +static uint64_t +hk_query_offset(struct hk_query_pool *pool, uint32_t query) +{ + assert(query < pool->vk.query_count); + return pool->query_start + query * pool->query_stride; +} + +static uint64_t +hk_query_report_addr(struct hk_device *dev, struct hk_query_pool *pool, + uint32_t query) +{ + if (pool->oq_queries) { + uint16_t *oq_index = hk_pool_oq_index_ptr(pool); + return dev->occlusion_queries.bo->ptr.gpu + + (oq_index[query] * sizeof(uint64_t)); + } else { + return pool->bo->ptr.gpu + hk_query_offset(pool, query); + } +} + +static struct hk_query_report * +hk_query_report_map(struct hk_device *dev, struct hk_query_pool *pool, + uint32_t query) +{ + if (pool->oq_queries) { + uint64_t *queries = (uint64_t *)dev->occlusion_queries.bo->ptr.cpu; + uint16_t *oq_index = hk_pool_oq_index_ptr(pool); + + return (struct hk_query_report *)&queries[oq_index[query]]; + } else { + return (void *)((char *)pool->bo->ptr.cpu + hk_query_offset(pool, query)); + } +} + 
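/*
 * Editor's note (illustrative sketch, not part of the patch): the helpers
 * above assume the following layout for the query pool BO. Availability
 * words come first, padded up to the report alignment; the remainder holds
 * either the occlusion-query index remap table (occlusion pools, whose
 * reports live in the device-wide dev->occlusion_queries heap) or the packed
 * reports themselves.
 *
 *   offset 0                  uint32_t available[vk.query_count]
 *   offset pool->query_start  uint16_t oq_index[vk.query_count]     (occlusion pools)
 *                             struct hk_query_report reports[...]   (all other pools)
 *
 * The hypothetical helper below is not used by the driver; it only mirrors
 * the size calculation in hk_CreateQueryPool to make the layout concrete.
 */
static inline uint32_t
hk_sketch_query_pool_bo_size(uint32_t query_count, uint32_t query_stride,
                             bool occlusion)
{
   /* Availability words, padded so the data that follows is report-aligned */
   uint32_t query_start =
      align(query_count * sizeof(uint32_t), sizeof(struct hk_query_report));

   /* Occlusion pools only store the uint16_t remap table here; every other
    * pool type stores its reports inline after the availability words.
    */
   uint32_t data_size = occlusion ? query_count * sizeof(uint16_t)
                                  : query_count * query_stride;

   return query_start + data_size;
}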
+struct hk_write_params { + uint64_t address; + uint32_t value; +}; + +static void +hk_nir_write_u32(nir_builder *b, UNUSED const void *key) +{ + nir_def *addr = nir_load_preamble( + b, 1, 64, .base = offsetof(struct hk_write_params, address) / 2); + + nir_def *value = nir_load_preamble( + b, 1, 32, .base = offsetof(struct hk_write_params, value) / 2); + + nir_store_global(b, addr, 4, value, nir_component_mask(1)); +} + +void +hk_queue_write(struct hk_cmd_buffer *cmd, uint64_t address, uint32_t value, + bool after_gfx) +{ + struct hk_cs *cs = hk_cmd_buffer_get_cs_general( + cmd, after_gfx ? &cmd->current_cs.post_gfx : &cmd->current_cs.cs, true); + if (!cs) + return; + + hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */); + + /* As soon as we mark a query available, it needs to be available system + * wide, otherwise a CPU-side get result can read stale data. As such, we + * cache flush before and then let coherency work its magic. Without this + * barrier, we get flakes in + * + * dEQP-VK.query_pool.occlusion_query.get_results_conservative_size_64_wait_query_without_availability_draw_triangles_discard + */ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + hk_cdm_cache_flush(dev, cs); + + struct hk_shader *s = hk_meta_kernel(dev, hk_nir_write_u32, NULL, 0); + struct hk_write_params params = {.address = address, .value = value}; + uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &params, sizeof(params)); + + hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(1, 1, 1), hk_grid(1, 1, 1)); +} + +/** + * Goes through a series of consecutive query indices in the given pool, + * setting all element values to 0 and emitting them as available. + */ +static void +emit_zero_queries(struct hk_cmd_buffer *cmd, struct hk_query_pool *pool, + uint32_t first_index, uint32_t num_queries, + bool set_available) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + for (uint32_t i = 0; i < num_queries; i++) { + uint64_t available = hk_query_available_addr(pool, first_index + i); + uint64_t report = hk_query_report_addr(dev, pool, first_index + i); + hk_queue_write(cmd, available, set_available, false); + + /* XXX: is this supposed to happen on the begin?
*/ + for (unsigned j = 0; j < hk_reports_per_query(pool); ++j) { + hk_queue_write(cmd, report + (j * sizeof(struct hk_query_report)), 0, + false); + } + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_ResetQueryPool(VkDevice device, VkQueryPool queryPool, uint32_t firstQuery, + uint32_t queryCount) +{ + VK_FROM_HANDLE(hk_query_pool, pool, queryPool); + VK_FROM_HANDLE(hk_device, dev, device); + + uint32_t *available = hk_query_available_map(pool, firstQuery); + struct hk_query_report *reports = hk_query_report_map(dev, pool, firstQuery); + + memset(available, 0, queryCount * sizeof(*available)); + memset(reports, 0, queryCount * pool->query_stride); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdResetQueryPool(VkCommandBuffer commandBuffer, VkQueryPool queryPool, + uint32_t firstQuery, uint32_t queryCount) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_query_pool, pool, queryPool); + + emit_zero_queries(cmd, pool, firstQuery, queryCount, false); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdWriteTimestamp2(VkCommandBuffer commandBuffer, + VkPipelineStageFlags2 stage, VkQueryPool queryPool, + uint32_t query) +{ + unreachable("todo"); +#if 0 + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_query_pool, pool, queryPool); + + struct nv_push *p = hk_cmd_buffer_push(cmd, 10); + + uint64_t report_addr = hk_query_report_addr(pool, query); + P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A); + P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32); + P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr); + P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0); + P_NV9097_SET_REPORT_SEMAPHORE_D(p, { + .operation = OPERATION_REPORT_ONLY, + .pipeline_location = vk_stage_flags_to_nv9097_pipeline_location(stage), + .structure_size = STRUCTURE_SIZE_FOUR_WORDS, + }); + + uint64_t available_addr = hk_query_available_addr(pool, query); + P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A); + P_NV9097_SET_REPORT_SEMAPHORE_A(p, available_addr >> 32); + P_NV9097_SET_REPORT_SEMAPHORE_B(p, available_addr); + P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1); + P_NV9097_SET_REPORT_SEMAPHORE_D(p, { + .operation = OPERATION_RELEASE, + .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE, + .pipeline_location = PIPELINE_LOCATION_ALL, + .structure_size = STRUCTURE_SIZE_ONE_WORD, + }); + + /* From the Vulkan spec: + * + * "If vkCmdWriteTimestamp2 is called while executing a render pass + * instance that has multiview enabled, the timestamp uses N consecutive + * query indices in the query pool (starting at query) where N is the + * number of bits set in the view mask of the subpass the command is + * executed in. The resulting query values are determined by an + * implementation-dependent choice of one of the following behaviors:" + * + * In our case, only the first query is used, so we emit zeros for the + * remaining queries, as described in the first behavior listed in the + * Vulkan spec: + * + * "The first query is a timestamp value and (if more than one bit is set + * in the view mask) zero is written to the remaining queries." 
+ */ + if (cmd->state.gfx.render.view_mask != 0) { + const uint32_t num_queries = + util_bitcount(cmd->state.gfx.render.view_mask); + if (num_queries > 1) + emit_zero_queries(cmd, pool, query + 1, num_queries - 1, true); + } +#endif +} + +static void +hk_cmd_begin_end_query(struct hk_cmd_buffer *cmd, struct hk_query_pool *pool, + uint32_t query, uint32_t index, + VkQueryControlFlags flags, bool end) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + bool graphics = false; + + switch (pool->vk.query_type) { + case VK_QUERY_TYPE_OCCLUSION: { + assert(query < pool->oq_queries); + + if (end) { + cmd->state.gfx.occlusion.mode = AGX_VISIBILITY_MODE_NONE; + } else { + cmd->state.gfx.occlusion.mode = flags & VK_QUERY_CONTROL_PRECISE_BIT + ? AGX_VISIBILITY_MODE_COUNTING + : AGX_VISIBILITY_MODE_BOOLEAN; + } + + uint16_t *oq_index = hk_pool_oq_index_ptr(pool); + cmd->state.gfx.occlusion.index = oq_index[query]; + cmd->state.gfx.dirty |= HK_DIRTY_OCCLUSION; + break; + } + + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: { + uint64_t addr = hk_query_report_addr(dev, pool, query); + cmd->state.gfx.xfb_query[index] = end ? 0 : addr; + break; + } + + case VK_QUERY_TYPE_PIPELINE_STATISTICS: { + struct hk_root_descriptor_table *root = &cmd->state.gfx.descriptors.root; + cmd->state.gfx.descriptors.root_dirty = true; + + root->draw.pipeline_stats = hk_query_report_addr(dev, pool, query); + root->draw.pipeline_stats_flags = pool->vk.pipeline_statistics; + + /* XXX: I don't think is correct... when does the query become available + * exactly? + */ + graphics = pool->vk.pipeline_statistics & + ~VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT; + break; + } + + default: + unreachable("Unsupported query type"); + } + + /* We need to set available=1 after the graphics work finishes. */ + if (end) { + hk_queue_write(cmd, hk_query_available_addr(pool, query), 1, graphics); + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool, + uint32_t query, VkQueryControlFlags flags, + uint32_t index) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_query_pool, pool, queryPool); + + hk_cmd_begin_end_query(cmd, pool, query, index, flags, false); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool, + uint32_t query, uint32_t index) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_query_pool, pool, queryPool); + + hk_cmd_begin_end_query(cmd, pool, query, index, 0, true); + + /* From the Vulkan spec: + * + * "If queries are used while executing a render pass instance that has + * multiview enabled, the query uses N consecutive query indices in + * the query pool (starting at query) where N is the number of bits set + * in the view mask in the subpass the query is used in. How the + * numerical results of the query are distributed among the queries is + * implementation-dependent." + * + * In our case, only the first query is used, so we emit zeros for the + * remaining queries. 
+ */ + if (cmd->state.gfx.render.view_mask != 0) { + const uint32_t num_queries = + util_bitcount(cmd->state.gfx.render.view_mask); + if (num_queries > 1) + emit_zero_queries(cmd, pool, query + 1, num_queries - 1, true); + } +} + +static bool +hk_query_is_available(struct hk_query_pool *pool, uint32_t query) +{ + uint32_t *available = hk_query_available_map(pool, query); + return p_atomic_read(available) != 0; +} + +#define HK_QUERY_TIMEOUT 2000000000ull + +static VkResult +hk_query_wait_for_available(struct hk_device *dev, struct hk_query_pool *pool, + uint32_t query) +{ + uint64_t abs_timeout_ns = os_time_get_absolute_timeout(HK_QUERY_TIMEOUT); + + while (os_time_get_nano() < abs_timeout_ns) { + if (hk_query_is_available(pool, query)) + return VK_SUCCESS; + + VkResult status = vk_device_check_status(&dev->vk); + if (status != VK_SUCCESS) + return status; + } + + return vk_device_set_lost(&dev->vk, "query timeout"); +} + +static void +cpu_write_query_result(void *dst, uint32_t idx, VkQueryResultFlags flags, + uint64_t result) +{ + if (flags & VK_QUERY_RESULT_64_BIT) { + uint64_t *dst64 = dst; + dst64[idx] = result; + } else { + uint32_t *dst32 = dst; + dst32[idx] = result; + } +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_GetQueryPoolResults(VkDevice device, VkQueryPool queryPool, + uint32_t firstQuery, uint32_t queryCount, + size_t dataSize, void *pData, VkDeviceSize stride, + VkQueryResultFlags flags) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_query_pool, pool, queryPool); + + if (vk_device_is_lost(&dev->vk)) + return VK_ERROR_DEVICE_LOST; + + VkResult status = VK_SUCCESS; + for (uint32_t i = 0; i < queryCount; i++) { + const uint32_t query = firstQuery + i; + + bool available = hk_query_is_available(pool, query); + + if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) { + status = hk_query_wait_for_available(dev, pool, query); + if (status != VK_SUCCESS) + return status; + + available = true; + } + + bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT); + + const struct hk_query_report *src = hk_query_report_map(dev, pool, query); + assert(i * stride < dataSize); + void *dst = (char *)pData + i * stride; + + uint32_t reports = hk_reports_per_query(pool); + if (write_results) { + for (uint32_t j = 0; j < reports; j++) { + cpu_write_query_result(dst, j, flags, src[j].value); + } + } + + if (!write_results) + status = VK_NOT_READY; + + if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) + cpu_write_query_result(dst, reports, flags, available); + } + + return status; +} + +static void +hk_nir_copy_query(nir_builder *b, UNUSED const void *key) +{ + nir_def *id = nir_channel(b, nir_load_workgroup_id(b), 0); + libagx_copy_query(b, nir_load_preamble(b, 1, 64), id); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPool, + uint32_t firstQuery, uint32_t queryCount, + VkBuffer dstBuffer, VkDeviceSize dstOffset, + VkDeviceSize stride, VkQueryResultFlags flags) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_query_pool, pool, queryPool); + VK_FROM_HANDLE(hk_buffer, dst_buffer, dstBuffer); + + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct hk_cs *cs = hk_cmd_buffer_get_cs(cmd, true); + if (!cs) + return; + + hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */); + + const struct libagx_copy_query_push info = { + .availability = pool->bo->ptr.gpu, + .results = pool->oq_queries ? 
dev->occlusion_queries.bo->ptr.gpu + : pool->bo->ptr.gpu + pool->query_start, + .oq_index = pool->oq_queries ? pool->bo->ptr.gpu + pool->query_start : 0, + + .first_query = firstQuery, + .dst_addr = hk_buffer_address(dst_buffer, dstOffset), + .dst_stride = stride, + .reports_per_query = hk_reports_per_query(pool), + + .partial = flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT, + ._64 = flags & VK_QUERY_RESULT_64_BIT, + .with_availability = flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT, + }; + + uint64_t push = hk_pool_upload(cmd, &info, sizeof(info), 8); + + struct hk_shader *s = hk_meta_kernel(dev, hk_nir_copy_query, NULL, 0); + uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &push, sizeof(push)); + hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(queryCount, 1, 1), + hk_grid(1, 1, 1)); +} diff --git a/src/asahi/vulkan/hk_query_pool.h b/src/asahi/vulkan/hk_query_pool.h new file mode 100644 index 00000000000..9e235dfed08 --- /dev/null +++ b/src/asahi/vulkan/hk_query_pool.h @@ -0,0 +1,28 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "hk_private.h" +#include "vk_query_pool.h" + +struct agx_bo; + +struct hk_query_pool { + struct vk_query_pool vk; + + uint32_t query_start; + uint32_t query_stride; + + struct agx_bo *bo; + void *bo_map; + + unsigned oq_queries; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_query_pool, vk.base, VkQueryPool, + VK_OBJECT_TYPE_QUERY_POOL) diff --git a/src/asahi/vulkan/hk_queue.c b/src/asahi/vulkan/hk_queue.c new file mode 100644 index 00000000000..7cc1c8be139 --- /dev/null +++ b/src/asahi/vulkan/hk_queue.c @@ -0,0 +1,599 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_queue.h" + +#include "agx_bo.h" +#include "agx_device.h" +#include "agx_pack.h" +#include "decode.h" +#include "hk_cmd_buffer.h" +#include "hk_device.h" +#include "hk_physical_device.h" + +#include +#include "asahi/lib/unstable_asahi_drm.h" +#include "util/list.h" +#include "vulkan/vulkan_core.h" + +#include "vk_drm_syncobj.h" +#include "vk_sync.h" + +/* + * We need to specially handle submits with no control streams. The kernel + * can't accept empty submits, but we can end up here in Vulkan for + * synchronization purposes only. Rather than submit a no-op job (slow), + * we simply tie the fences together. + */ +static VkResult +queue_submit_empty(struct hk_device *dev, struct hk_queue *queue, + struct vk_queue_submit *submit) +{ + int fd = dev->dev.fd; + + /* Transfer the waits into the queue timeline. */ + for (unsigned i = 0; i < submit->wait_count; ++i) { + struct vk_sync_wait *wait = &submit->waits[i]; + + assert(vk_sync_type_is_drm_syncobj(wait->sync->type)); + const struct vk_drm_syncobj *syncobj = vk_sync_as_drm_syncobj(wait->sync); + + drmSyncobjTransfer(fd, queue->drm.syncobj, ++queue->drm.timeline_value, + syncobj->syncobj, wait->wait_value, 0); + } + + /* Transfer the queue timeline into each out fence. They will all be + * signalled when we reach this point. 
+ */ + for (unsigned i = 0; i < submit->signal_count; ++i) { + struct vk_sync_signal *signal = &submit->signals[i]; + + assert(vk_sync_type_is_drm_syncobj(signal->sync->type)); + const struct vk_drm_syncobj *syncobj = + vk_sync_as_drm_syncobj(signal->sync); + + drmSyncobjTransfer(fd, syncobj->syncobj, signal->signal_value, + queue->drm.syncobj, queue->drm.timeline_value, 0); + } + + return VK_SUCCESS; +} + +static void +asahi_fill_cdm_command(struct hk_device *dev, struct hk_cs *cs, + struct drm_asahi_cmd_compute *cmd) +{ + size_t len = cs->stream_linked ? 65536 /* XXX */ : (cs->current - cs->start); + + *cmd = (struct drm_asahi_cmd_compute){ + .encoder_ptr = cs->addr, + .encoder_end = cs->addr + len, + + .sampler_array = dev->samplers.table.bo->ptr.gpu, + .sampler_count = dev->samplers.table.alloc, + .sampler_max = dev->samplers.table.alloc + 1, + + .encoder_id = agx_get_global_id(&dev->dev), + .cmd_id = agx_get_global_id(&dev->dev), + .unk_mask = 0xffffffff, + }; + + if (cs->scratch.cs.main || cs->scratch.cs.preamble) { + cmd->helper_arg = dev->scratch.cs.buf->ptr.gpu; + cmd->helper_cfg = cs->scratch.cs.preamble << 16; + cmd->helper_program = dev->dev.helper->ptr.gpu | 1; + } +} + +static void +asahi_fill_vdm_command(struct hk_device *dev, struct hk_cs *cs, + struct drm_asahi_cmd_render *c) +{ +#if 0 + bool clear_pipeline_textures = + agx_tilebuffer_spills(&batch->tilebuffer_layout); + + for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) { + struct pipe_surface *surf = batch->key.cbufs[i]; + + clear_pipeline_textures |= + surf && surf->texture && !(batch->clear & (PIPE_CLEAR_COLOR0 << i)); + } + +#endif + unsigned cmd_ta_id = agx_get_global_id(&dev->dev); + unsigned cmd_3d_id = agx_get_global_id(&dev->dev); + unsigned encoder_id = agx_get_global_id(&dev->dev); + + memset(c, 0, sizeof(*c)); + + c->encoder_ptr = cs->addr; + c->encoder_id = encoder_id; + c->cmd_3d_id = cmd_3d_id; + c->cmd_ta_id = cmd_ta_id; + c->ppp_ctrl = 0x202; + + c->fb_width = cs->cr.width; + c->fb_height = cs->cr.height; + + c->isp_bgobjdepth = cs->cr.isp_bgobjdepth; + c->isp_bgobjvals = cs->cr.isp_bgobjvals; + + static_assert(sizeof(c->zls_ctrl) == sizeof(cs->cr.zls_control)); + memcpy(&c->zls_ctrl, &cs->cr.zls_control, sizeof(cs->cr.zls_control)); + + c->depth_dimensions = (cs->cr.width - 1) | ((cs->cr.height - 1) << 15); + + c->depth_buffer_load = cs->cr.depth.buffer; + c->depth_buffer_store = cs->cr.depth.buffer; + c->depth_buffer_partial = cs->cr.depth.buffer; + + c->depth_buffer_load_stride = cs->cr.depth.stride; + c->depth_buffer_store_stride = cs->cr.depth.stride; + c->depth_buffer_partial_stride = cs->cr.depth.stride; + + c->depth_meta_buffer_load = cs->cr.depth.meta; + c->depth_meta_buffer_store = cs->cr.depth.meta; + c->depth_meta_buffer_partial = cs->cr.depth.meta; + + c->depth_meta_buffer_load_stride = cs->cr.depth.stride; + c->depth_meta_buffer_store_stride = cs->cr.depth.meta_stride; + c->depth_meta_buffer_partial_stride = cs->cr.depth.meta_stride; + + c->stencil_buffer_load = cs->cr.stencil.buffer; + c->stencil_buffer_store = cs->cr.stencil.buffer; + c->stencil_buffer_partial = cs->cr.stencil.buffer; + + c->stencil_buffer_load_stride = cs->cr.stencil.stride; + c->stencil_buffer_store_stride = cs->cr.stencil.stride; + c->stencil_buffer_partial_stride = cs->cr.stencil.stride; + + c->stencil_meta_buffer_load = cs->cr.stencil.meta; + c->stencil_meta_buffer_store = cs->cr.stencil.meta; + c->stencil_meta_buffer_partial = cs->cr.stencil.meta; + + c->stencil_meta_buffer_load_stride = cs->cr.stencil.stride; + 
c->stencil_meta_buffer_store_stride = cs->cr.stencil.meta_stride; + c->stencil_meta_buffer_partial_stride = cs->cr.stencil.meta_stride; + + c->iogpu_unk_214 = cs->cr.iogpu_unk_214; + +#if 0 + if (clear_pipeline_textures) + c->flags |= ASAHI_RENDER_SET_WHEN_RELOADING_Z_OR_S; + else + c->flags |= ASAHI_RENDER_NO_CLEAR_PIPELINE_TEXTURES; + + if (zres && !(batch->clear & PIPE_CLEAR_DEPTH)) + c->flags |= ASAHI_RENDER_SET_WHEN_RELOADING_Z_OR_S; + + if (sres && !(batch->clear & PIPE_CLEAR_STENCIL)) + c->flags |= ASAHI_RENDER_SET_WHEN_RELOADING_Z_OR_S; +#endif + + if (dev->dev.debug & AGX_DBG_NOCLUSTER) + c->flags |= ASAHI_RENDER_NO_VERTEX_CLUSTERING; + +#if 0 + /* XXX is this for just MSAA+Z+S or MSAA+(Z|S)? */ + if (tib->nr_samples > 1 && framebuffer->zsbuf) + c->flags |= ASAHI_RENDER_MSAA_ZS; +#endif + + c->utile_width = cs->tib.tile_size.width; + c->utile_height = cs->tib.tile_size.height; + + /* Can be 0 for attachmentless rendering with no draws */ + c->samples = MAX2(cs->tib.nr_samples, 1); + c->layers = cs->cr.layers; + + c->ppp_multisamplectl = cs->ppp_multisamplectl; + c->sample_size = cs->tib.sample_size_B; + + /* XXX OR 0x80 with eMRT? */ + c->tib_blocks = ALIGN_POT(agx_tilebuffer_total_size(&cs->tib), 2048) / 2048; + + float tan_60 = 1.732051f; + c->merge_upper_x = fui(tan_60 / cs->cr.width); + c->merge_upper_y = fui(tan_60 / cs->cr.height); + + c->load_pipeline = cs->cr.bg.main.usc | 4; + c->store_pipeline = cs->cr.eot.main.usc | 4; + c->partial_reload_pipeline = cs->cr.bg.partial.usc | 4; + c->partial_store_pipeline = cs->cr.eot.partial.usc | 4; + + memcpy(&c->load_pipeline_bind, &cs->cr.bg.main.counts, + sizeof(struct agx_counts_packed)); + + memcpy(&c->store_pipeline_bind, &cs->cr.eot.main.counts, + sizeof(struct agx_counts_packed)); + + memcpy(&c->partial_reload_pipeline_bind, &cs->cr.bg.partial.counts, + sizeof(struct agx_counts_packed)); + + memcpy(&c->partial_store_pipeline_bind, &cs->cr.eot.partial.counts, + sizeof(struct agx_counts_packed)); + + c->scissor_array = cs->uploaded_scissor; + c->depth_bias_array = cs->uploaded_zbias; + + c->vertex_sampler_array = dev->samplers.table.bo->ptr.gpu; + c->vertex_sampler_count = dev->samplers.table.alloc; + c->vertex_sampler_max = dev->samplers.table.alloc + 1; + + c->fragment_sampler_array = c->vertex_sampler_array; + c->fragment_sampler_count = c->vertex_sampler_count; + c->fragment_sampler_max = c->vertex_sampler_max; + + c->visibility_result_buffer = dev->occlusion_queries.bo->ptr.gpu; + + /* If a tile is empty, we do not want to process it, as the redundant + * roundtrip of memory-->tilebuffer-->memory wastes a tremendous amount of + * memory bandwidth. Any draw marks a tile as non-empty, so we only need to + * process empty tiles if the background+EOT programs have a side effect. + * This is the case exactly when there is an attachment we are clearing (some + * attachment A in clear and in resolve <==> non-empty intersection). + * + * This case matters a LOT for performance in workloads that split batches. 
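+    *
+    * For example, a pass that clears an attachment but only draws to a few
+    * tiles still needs the background/EOT programs to run on every tile so
+    * the clear reaches memory, whereas a pass with no clears and no resolves
+    * can skip its untouched tiles entirely.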
+ */ + if (true /* TODO */) + c->flags |= ASAHI_RENDER_PROCESS_EMPTY_TILES; + + if (cs->scratch.vs.main || cs->scratch.vs.preamble) { + c->flags |= ASAHI_RENDER_VERTEX_SPILLS; + c->vertex_helper_arg = dev->scratch.vs.buf->ptr.gpu; + c->vertex_helper_cfg = cs->scratch.vs.preamble << 16; + c->vertex_helper_program = dev->dev.helper->ptr.gpu | 1; + } + + if (cs->scratch.fs.main || cs->scratch.fs.preamble) { + c->fragment_helper_arg = dev->scratch.fs.buf->ptr.gpu; + c->fragment_helper_cfg = cs->scratch.fs.preamble << 16; + c->fragment_helper_program = dev->dev.helper->ptr.gpu | 1; + } +} + +static void +asahi_fill_sync(struct drm_asahi_sync *sync, struct vk_sync *vk_sync, + uint64_t value) +{ + if (unlikely(!vk_sync_type_is_drm_syncobj(vk_sync->type))) { + unreachable("Unsupported sync type"); + return; + } + + const struct vk_drm_syncobj *syncobj = vk_sync_as_drm_syncobj(vk_sync); + *sync = (struct drm_asahi_sync){.handle = syncobj->syncobj}; + + if (vk_sync->flags & VK_SYNC_IS_TIMELINE) { + sync->sync_type = DRM_ASAHI_SYNC_TIMELINE_SYNCOBJ; + sync->timeline_value = value; + } else { + sync->sync_type = DRM_ASAHI_SYNC_SYNCOBJ; + } +} + +union drm_asahi_cmd { + struct drm_asahi_cmd_compute compute; + struct drm_asahi_cmd_render render; +}; + +/* TODO: I think it's 64. Can we query from the kernel? */ +#define MAX_COMMANDS_PER_SUBMIT (16) + +static VkResult +queue_submit_single(struct agx_device *dev, struct drm_asahi_submit *submit) +{ + int ret = dev->ops.submit(dev, submit, 0); + + /* XXX: don't trap */ + if (ret) { + fprintf(stderr, "DRM_IOCTL_ASAHI_SUBMIT failed: %m\n"); + assert(0); + } + + return VK_SUCCESS; +} + +/* + * The kernel/firmware jointly impose a limit on commands per submit ioctl, but + * we can build up arbitrarily large command buffers. We handle this here by + * looping the ioctl, submitting slices of the command buffers that are within + * bounds. + */ +static VkResult +queue_submit_looped(struct agx_device *dev, struct drm_asahi_submit *submit) +{ + struct drm_asahi_command *cmds = (void *)submit->commands; + unsigned commands_remaining = submit->command_count; + unsigned submitted_vdm = 0, submitted_cdm = 0; + + while (commands_remaining) { + bool first = commands_remaining == submit->command_count; + bool last = commands_remaining <= MAX_COMMANDS_PER_SUBMIT; + + unsigned count = MIN2(commands_remaining, MAX_COMMANDS_PER_SUBMIT); + commands_remaining -= count; + + assert(!last || commands_remaining == 0); + assert(count > 0); + + /* We need to fix up the barriers since barriers are ioctl-relative */ + for (unsigned i = 0; i < count; ++i) { + assert(cmds[i].barriers[0] >= submitted_vdm); + assert(cmds[i].barriers[1] >= submitted_cdm); + + cmds[i].barriers[0] -= submitted_vdm; + cmds[i].barriers[1] -= submitted_cdm; + } + + /* We can't signal the out-syncobjs until all prior work finishes. Since + * only the last ioctl will signal, make sure it waits on prior ioctls. + * + * TODO: there might be a more performant way to do this. + */ + if (last && !first) { + if (cmds[0].barriers[0] == DRM_ASAHI_BARRIER_NONE) + cmds[0].barriers[0] = 0; + + if (cmds[0].barriers[1] == DRM_ASAHI_BARRIER_NONE) + cmds[0].barriers[1] = 0; + } + + struct drm_asahi_submit submit_ioctl = { + .flags = submit->flags, + .queue_id = submit->queue_id, + .result_handle = submit->result_handle, + .commands = (uint64_t)(uintptr_t)(cmds), + .command_count = count, + .in_syncs = first ? submit->in_syncs : 0, + .in_sync_count = first ? submit->in_sync_count : 0, + .out_syncs = last ? 
submit->out_syncs : 0, + .out_sync_count = last ? submit->out_sync_count : 0, + }; + + VkResult result = queue_submit_single(dev, &submit_ioctl); + if (result != VK_SUCCESS) + return result; + + for (unsigned i = 0; i < count; ++i) { + if (cmds[i].cmd_type == DRM_ASAHI_CMD_COMPUTE) + submitted_cdm++; + else if (cmds[i].cmd_type == DRM_ASAHI_CMD_RENDER) + submitted_vdm++; + else + unreachable("unknown subqueue"); + } + + cmds += count; + } + + return VK_SUCCESS; +} + +static VkResult +queue_submit(struct hk_device *dev, struct hk_queue *queue, + struct vk_queue_submit *submit) +{ + unsigned command_count = 0; + + /* Gather the number of individual commands to submit up front */ + for (unsigned i = 0; i < submit->command_buffer_count; ++i) { + struct hk_cmd_buffer *cmdbuf = + (struct hk_cmd_buffer *)submit->command_buffers[i]; + + command_count += list_length(&cmdbuf->control_streams); + } + + if (command_count == 0) + return queue_submit_empty(dev, queue, submit); + + unsigned wait_count = 0; + struct drm_asahi_sync *waits = + alloca(submit->wait_count * sizeof(struct drm_asahi_sync)); + + struct drm_asahi_sync *signals = + alloca((submit->signal_count + 1) * sizeof(struct drm_asahi_sync)); + + for (unsigned i = 0; i < submit->wait_count; ++i) { + /* The kernel rejects the submission if we try to wait on the same + * timeline semaphore at multiple points. + * + * TODO: Can we relax the UAPI? + * + * XXX: This is quadratic time. + */ + bool skip = false; + if (submit->waits[i].sync->flags & VK_SYNC_IS_TIMELINE) { + uint32_t v1 = submit->waits[i].wait_value; + for (unsigned j = 0; j < submit->wait_count; ++j) { + uint32_t v2 = submit->waits[j].wait_value; + if (i != j && submit->waits[i].sync == submit->waits[j].sync && + (v1 < v2 || (v1 == v2 && i < j))) { + skip = true; + break; + } + } + + if (skip) + continue; + } + + asahi_fill_sync(&waits[wait_count++], submit->waits[i].sync, + submit->waits[i].wait_value); + } + + for (unsigned i = 0; i < submit->signal_count; ++i) { + asahi_fill_sync(&signals[i], submit->signals[i].sync, + submit->signals[i].signal_value); + } + + /* Signal progress on the queue itself */ + signals[submit->signal_count] = (struct drm_asahi_sync){ + .sync_type = DRM_ASAHI_SYNC_TIMELINE_SYNCOBJ, + .handle = queue->drm.syncobj, + .timeline_value = ++queue->drm.timeline_value, + }; + + /* Now setup the command structs */ + struct drm_asahi_command *cmds = alloca(sizeof(*cmds) * command_count); + union drm_asahi_cmd *cmds_inner = + alloca(sizeof(*cmds_inner) * command_count); + + unsigned cmd_it = 0; + unsigned nr_vdm = 0, nr_cdm = 0; + + for (unsigned i = 0; i < submit->command_buffer_count; ++i) { + struct hk_cmd_buffer *cmdbuf = + (struct hk_cmd_buffer *)submit->command_buffers[i]; + + list_for_each_entry(struct hk_cs, cs, &cmdbuf->control_streams, node) { + assert(cmd_it < command_count); + + struct drm_asahi_command cmd = { + .cmd_buffer = (uint64_t)(uintptr_t)&cmds_inner[cmd_it], + .result_offset = 0 /* TODO */, + .result_size = 0 /* TODO */, + /* Barrier on previous command */ + .barriers = {nr_vdm, nr_cdm}, + }; + + if (cs->type == HK_CS_CDM) { + cmd.cmd_type = DRM_ASAHI_CMD_COMPUTE; + cmd.cmd_buffer_size = sizeof(struct drm_asahi_cmd_compute); + nr_cdm++; + + asahi_fill_cdm_command(dev, cs, &cmds_inner[cmd_it].compute); + } else { + assert(cs->type == HK_CS_VDM); + cmd.cmd_type = DRM_ASAHI_CMD_RENDER; + cmd.cmd_buffer_size = sizeof(struct drm_asahi_cmd_render); + nr_vdm++; + + asahi_fill_vdm_command(dev, cs, &cmds_inner[cmd_it].render); + } + + cmds[cmd_it++] = 
cmd; + } + } + + assert(cmd_it == command_count); + + if (dev->dev.debug & AGX_DBG_TRACE) { + for (unsigned i = 0; i < command_count; ++i) { + if (cmds[i].cmd_type == DRM_ASAHI_CMD_COMPUTE) { + agxdecode_drm_cmd_compute(dev->dev.agxdecode, &dev->dev.params, + &cmds_inner[i].compute, true); + } else { + assert(cmds[i].cmd_type == DRM_ASAHI_CMD_RENDER); + agxdecode_drm_cmd_render(dev->dev.agxdecode, &dev->dev.params, + &cmds_inner[i].render, true); + } + } + + agxdecode_image_heap(dev->dev.agxdecode, dev->images.bo->ptr.gpu, + dev->images.alloc); + + agxdecode_next_frame(); + } + + struct drm_asahi_submit submit_ioctl = { + .flags = 0, + .queue_id = queue->drm.id, + .result_handle = 0 /* TODO */, + .in_sync_count = wait_count, + .out_sync_count = submit->signal_count + 1, + .command_count = command_count, + .in_syncs = (uint64_t)(uintptr_t)(waits), + .out_syncs = (uint64_t)(uintptr_t)(signals), + .commands = (uint64_t)(uintptr_t)(cmds), + }; + + if (command_count <= MAX_COMMANDS_PER_SUBMIT) + return queue_submit_single(&dev->dev, &submit_ioctl); + else + return queue_submit_looped(&dev->dev, &submit_ioctl); +} + +static VkResult +hk_queue_submit(struct vk_queue *vk_queue, struct vk_queue_submit *submit) +{ + struct hk_queue *queue = container_of(vk_queue, struct hk_queue, vk); + struct hk_device *dev = hk_queue_device(queue); + + if (vk_queue_is_lost(&queue->vk)) + return VK_ERROR_DEVICE_LOST; + + VkResult result = queue_submit(dev, queue, submit); + if (result != VK_SUCCESS) + return vk_queue_set_lost(&queue->vk, "Submit failed"); + + return VK_SUCCESS; +} + +VkResult +hk_queue_init(struct hk_device *dev, struct hk_queue *queue, + const VkDeviceQueueCreateInfo *pCreateInfo, + uint32_t index_in_family) +{ + struct hk_physical_device *pdev = hk_device_physical(dev); + VkResult result; + + assert(pCreateInfo->queueFamilyIndex < pdev->queue_family_count); + + const VkDeviceQueueGlobalPriorityCreateInfoKHR *priority_info = + vk_find_struct_const(pCreateInfo->pNext, + DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR); + const enum VkQueueGlobalPriorityKHR global_priority = + priority_info ? 
priority_info->globalPriority + : VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR; + + if (global_priority != VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR) { + return VK_ERROR_INITIALIZATION_FAILED; + } + + result = vk_queue_init(&queue->vk, &dev->vk, pCreateInfo, index_in_family); + if (result != VK_SUCCESS) + return result; + + queue->vk.driver_submit = hk_queue_submit; + + queue->drm.id = agx_create_command_queue(&dev->dev, + DRM_ASAHI_QUEUE_CAP_RENDER | + DRM_ASAHI_QUEUE_CAP_BLIT | + DRM_ASAHI_QUEUE_CAP_COMPUTE, + 2); + + if (drmSyncobjCreate(dev->dev.fd, 0, &queue->drm.syncobj)) { + mesa_loge("drmSyncobjCreate() failed %d\n", errno); + agx_destroy_command_queue(&dev->dev, queue->drm.id); + vk_queue_finish(&queue->vk); + + return vk_errorf(dev, VK_ERROR_OUT_OF_HOST_MEMORY, + "DRM_IOCTL_SYNCOBJ_CREATE failed: %m"); + } + + uint64_t initial_value = 1; + if (drmSyncobjTimelineSignal(dev->dev.fd, &queue->drm.syncobj, + &initial_value, 1)) { + hk_queue_finish(dev, queue); + return vk_errorf(dev, VK_ERROR_OUT_OF_HOST_MEMORY, + "DRM_IOCTL_TIMELINE_SYNCOBJ_SIGNAL failed: %m"); + } + + return VK_SUCCESS; +} + +void +hk_queue_finish(struct hk_device *dev, struct hk_queue *queue) +{ + drmSyncobjDestroy(dev->dev.fd, queue->drm.syncobj); + agx_destroy_command_queue(&dev->dev, queue->drm.id); + vk_queue_finish(&queue->vk); +} diff --git a/src/asahi/vulkan/hk_queue.h b/src/asahi/vulkan/hk_queue.h new file mode 100644 index 00000000000..42e446ba430 --- /dev/null +++ b/src/asahi/vulkan/hk_queue.h @@ -0,0 +1,42 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "hk_private.h" +#include "vk_queue.h" + +struct hk_device; + +struct hk_queue { + struct vk_queue vk; + + struct { + /* Asahi kernel queue ID */ + uint32_t id; + + /* Timeline syncobj backing the queue */ + uint32_t syncobj; + + /* Current maximum timeline value for the queue's syncobj. If the + * syncobj's value equals timeline_value, then all work is complete. + */ + uint32_t timeline_value; + } drm; +}; + +static inline struct hk_device * +hk_queue_device(struct hk_queue *queue) +{ + return (struct hk_device *)queue->vk.base.device; +} + +VkResult hk_queue_init(struct hk_device *dev, struct hk_queue *queue, + const VkDeviceQueueCreateInfo *pCreateInfo, + uint32_t index_in_family); + +void hk_queue_finish(struct hk_device *dev, struct hk_queue *queue); diff --git a/src/asahi/vulkan/hk_sampler.c b/src/asahi/vulkan/hk_sampler.c new file mode 100644 index 00000000000..7e936b0cb04 --- /dev/null +++ b/src/asahi/vulkan/hk_sampler.c @@ -0,0 +1,281 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT
+ */
+#include "hk_sampler.h"
+
+#include "hk_device.h"
+#include "hk_entrypoints.h"
+#include "hk_physical_device.h"
+
+#include "vk_enum_to_str.h"
+#include "vk_format.h"
+#include "vk_sampler.h"
+
+#include "asahi/genxml/agx_pack.h"
+
+static inline uint32_t
+translate_address_mode(VkSamplerAddressMode addr_mode)
+{
+#define MODE(VK, AGX_) [VK_SAMPLER_ADDRESS_MODE_##VK] = AGX_WRAP_##AGX_
+   static const uint8_t translate[] = {
+      MODE(REPEAT, REPEAT),
+      MODE(MIRRORED_REPEAT, MIRRORED_REPEAT),
+      MODE(CLAMP_TO_EDGE, CLAMP_TO_EDGE),
+      MODE(CLAMP_TO_BORDER, CLAMP_TO_BORDER),
+      MODE(MIRROR_CLAMP_TO_EDGE, MIRRORED_CLAMP_TO_EDGE),
+   };
+#undef MODE
+
+   assert(addr_mode < ARRAY_SIZE(translate));
+   return translate[addr_mode];
+}
+
+static uint32_t
+translate_texsamp_compare_op(VkCompareOp op)
+{
+#define OP(VK, AGX_) [VK_COMPARE_OP_##VK] = AGX_COMPARE_FUNC_##AGX_
+   static const uint8_t translate[] = {
+      OP(NEVER, NEVER),
+      OP(LESS, LESS),
+      OP(EQUAL, EQUAL),
+      OP(LESS_OR_EQUAL, LEQUAL),
+      OP(GREATER, GREATER),
+      OP(NOT_EQUAL, NOT_EQUAL),
+      OP(GREATER_OR_EQUAL, GEQUAL),
+      OP(ALWAYS, ALWAYS),
+   };
+#undef OP
+
+   assert(op < ARRAY_SIZE(translate));
+   return translate[op];
+}
+
+static enum agx_filter
+translate_filter(VkFilter filter)
+{
+   static_assert((enum agx_filter)VK_FILTER_NEAREST == AGX_FILTER_NEAREST);
+   static_assert((enum agx_filter)VK_FILTER_LINEAR == AGX_FILTER_LINEAR);
+
+   return (enum agx_filter)filter;
+}
+
+static enum agx_mip_filter
+translate_mipfilter(VkSamplerMipmapMode mode)
+{
+   switch (mode) {
+   case VK_SAMPLER_MIPMAP_MODE_NEAREST:
+      return AGX_MIP_FILTER_NEAREST;
+
+   case VK_SAMPLER_MIPMAP_MODE_LINEAR:
+      return AGX_MIP_FILTER_LINEAR;
+
+   default:
+      unreachable("Invalid filter");
+   }
+}
+
+static bool
+uses_border(const VkSamplerCreateInfo *info)
+{
+   return info->addressModeU == VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER ||
+          info->addressModeV == VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER ||
+          info->addressModeW == VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER;
+}
+
+static bool
+is_border_color_custom(VkBorderColor color)
+{
+   /* TODO: for now, opaque black is treated as custom due to rgba4 swizzling
+    * issues, could be optimized though.
+    */
+   switch (color) {
+   case VK_BORDER_COLOR_INT_OPAQUE_BLACK:
+   case VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK:
+   case VK_BORDER_COLOR_INT_CUSTOM_EXT:
+   case VK_BORDER_COLOR_FLOAT_CUSTOM_EXT:
+      return true;
+   default:
+      return false;
+   }
+}
+
+/* Translate an American VkBorderColor into a Canadian agx_border_colour */
+static enum agx_border_colour
+translate_border_color(VkBorderColor color, bool custom_to_1)
+{
+   switch (color) {
+   case VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK:
+   case VK_BORDER_COLOR_INT_TRANSPARENT_BLACK:
+      return AGX_BORDER_COLOUR_TRANSPARENT_BLACK;
+
+   case VK_BORDER_COLOR_INT_OPAQUE_WHITE:
+   case VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE:
+      return AGX_BORDER_COLOUR_OPAQUE_WHITE;
+
+   default:
+      assert(is_border_color_custom(color));
+      return custom_to_1 ?
AGX_BORDER_COLOUR_OPAQUE_WHITE + : AGX_BORDER_COLOUR_TRANSPARENT_BLACK; + } +} + +static void +pack_sampler(const struct hk_physical_device *pdev, + const struct VkSamplerCreateInfo *info, bool custom_to_1, + struct agx_sampler_packed *out) +{ + agx_pack(out, SAMPLER, cfg) { + cfg.minimum_lod = info->minLod; + cfg.maximum_lod = info->maxLod; + cfg.magnify = translate_filter(info->magFilter); + cfg.minify = translate_filter(info->minFilter); + cfg.mip_filter = translate_mipfilter(info->mipmapMode); + cfg.wrap_s = translate_address_mode(info->addressModeU); + cfg.wrap_t = translate_address_mode(info->addressModeV); + cfg.wrap_r = translate_address_mode(info->addressModeW); + cfg.pixel_coordinates = info->unnormalizedCoordinates; + + cfg.seamful_cube_maps = + info->flags & VK_SAMPLER_CREATE_NON_SEAMLESS_CUBE_MAP_BIT_EXT; + + if (info->compareEnable) { + cfg.compare_func = translate_texsamp_compare_op(info->compareOp); + cfg.compare_enable = true; + } + + if (info->anisotropyEnable) { + cfg.maximum_anisotropy = + util_next_power_of_two(MAX2(info->maxAnisotropy, 1)); + } else { + cfg.maximum_anisotropy = 1; + } + + if (uses_border(info)) { + cfg.border_colour = + translate_border_color(info->borderColor, custom_to_1); + } + } +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateSampler(VkDevice device, + const VkSamplerCreateInfo *info /* pCreateInfo */, + const VkAllocationCallbacks *pAllocator, VkSampler *pSampler) +{ + VK_FROM_HANDLE(hk_device, dev, device); + struct hk_physical_device *pdev = hk_device_physical(dev); + struct hk_sampler *sampler; + VkResult result; + + sampler = vk_sampler_create(&dev->vk, info, pAllocator, sizeof(*sampler)); + if (!sampler) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + struct agx_sampler_packed samp; + pack_sampler(pdev, info, true, &samp); + + /* LOD bias passed in the descriptor set */ + sampler->lod_bias_fp16 = _mesa_float_to_half(info->mipLodBias); + + result = + hk_sampler_heap_add(dev, samp, &sampler->planes[sampler->plane_count].hw); + if (result != VK_SUCCESS) { + hk_DestroySampler(device, hk_sampler_to_handle(sampler), pAllocator); + return result; + } + + sampler->plane_count++; + + /* In order to support CONVERSION_SEPARATE_RECONSTRUCTION_FILTER_BIT, we + * need multiple sampler planes: at minimum we will need one for luminance + * (the default), and one for chroma. Each sampler plane needs its own + * sampler table entry. However, sampler table entries are very rare on + * G13, and each plane would burn one of those. So we make sure to allocate + * only the minimum amount that we actually need (i.e., either 1 or 2), and + * then just copy the last sampler plane out as far as we need to fill the + * number of image planes. 
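+    *
+    * Concretely, with the logic below: a YCbCr conversion whose chroma filter
+    * matches the luma filter keeps plane_count == 1, while a conversion with
+    * a distinct chromaFilter packs a second sampler using that filter, giving
+    * plane_count == 2. Any remaining image planes then reuse the last sampler
+    * plane.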
+ */ + if (sampler->vk.ycbcr_conversion) { + assert(!uses_border(info) && + "consequence of VUID-VkSamplerCreateInfo-addressModeU-01646"); + + const VkFilter chroma_filter = + sampler->vk.ycbcr_conversion->state.chroma_filter; + if (info->magFilter != chroma_filter || + info->minFilter != chroma_filter) { + VkSamplerCreateInfo plane2_info = *info; + plane2_info.magFilter = chroma_filter; + plane2_info.minFilter = chroma_filter; + + pack_sampler(pdev, &plane2_info, false, &samp); + result = hk_sampler_heap_add( + dev, samp, &sampler->planes[sampler->plane_count].hw); + + if (result != VK_SUCCESS) { + hk_DestroySampler(device, hk_sampler_to_handle(sampler), + pAllocator); + return result; + } + + sampler->plane_count++; + } + } else if (uses_border(info)) { + /* If the sampler uses custom border colours, we need both clamp-to-1 + * and clamp-to-0 variants. We treat these as planes. + */ + pack_sampler(pdev, info, false, &samp); + result = hk_sampler_heap_add(dev, samp, + &sampler->planes[sampler->plane_count].hw); + + if (result != VK_SUCCESS) { + hk_DestroySampler(device, hk_sampler_to_handle(sampler), pAllocator); + return result; + } + + sampler->plane_count++; + + /* We also need to record the border. + * + * If there is a border colour component mapping, we need to swizzle with + * it. Otherwise, we can assume there's nothing to do. + */ + VkClearColorValue bc = sampler->vk.border_color_value; + + const VkSamplerBorderColorComponentMappingCreateInfoEXT *swiz_info = + vk_find_struct_const( + info->pNext, + SAMPLER_BORDER_COLOR_COMPONENT_MAPPING_CREATE_INFO_EXT); + + if (swiz_info) { + const bool is_int = vk_border_color_is_int(info->borderColor); + bc = vk_swizzle_color_value(bc, swiz_info->components, is_int); + } + + sampler->custom_border = bc; + sampler->has_border = true; + } + + *pSampler = hk_sampler_to_handle(sampler); + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +hk_DestroySampler(VkDevice device, VkSampler _sampler, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_sampler, sampler, _sampler); + + if (!sampler) + return; + + for (uint8_t plane = 0; plane < sampler->plane_count; plane++) { + hk_sampler_heap_remove(dev, sampler->planes[plane].hw); + } + + vk_sampler_destroy(&dev->vk, pAllocator, &sampler->vk); +} diff --git a/src/asahi/vulkan/hk_sampler.h b/src/asahi/vulkan/hk_sampler.h new file mode 100644 index 00000000000..444aabc8d65 --- /dev/null +++ b/src/asahi/vulkan/hk_sampler.h @@ -0,0 +1,33 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "hk_device.h" +#include "hk_physical_device.h" +#include "hk_private.h" + +#include "vk_sampler.h" +#include "vk_ycbcr_conversion.h" + +#include "vk_format.h" + +struct hk_sampler { + struct vk_sampler vk; + VkClearColorValue custom_border; + bool has_border; + + uint8_t plane_count; + uint16_t lod_bias_fp16; + + struct { + struct hk_rc_sampler *hw; + } planes[2]; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_sampler, vk.base, VkSampler, + VK_OBJECT_TYPE_SAMPLER) diff --git a/src/asahi/vulkan/hk_shader.c b/src/asahi/vulkan/hk_shader.c new file mode 100644 index 00000000000..60303963fd7 --- /dev/null +++ b/src/asahi/vulkan/hk_shader.c @@ -0,0 +1,1432 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ +#include "hk_shader.h" + +#include "agx_helpers.h" +#include "agx_nir_lower_gs.h" +#include "glsl_types.h" +#include "nir.h" +#include "nir_builder.h" + +#include "agx_bo.h" +#include "hk_cmd_buffer.h" +#include "hk_descriptor_set_layout.h" +#include "hk_device.h" +#include "hk_physical_device.h" +#include "hk_sampler.h" +#include "hk_shader.h" + +#include "nir_builder_opcodes.h" +#include "nir_builtin_builder.h" +#include "nir_intrinsics.h" +#include "nir_intrinsics_indices.h" +#include "nir_xfb_info.h" +#include "shader_enums.h" +#include "vk_nir_convert_ycbcr.h" +#include "vk_pipeline.h" +#include "vk_pipeline_layout.h" +#include "vk_shader_module.h" +#include "vk_ycbcr_conversion.h" + +#include "asahi/compiler/agx_compile.h" +#include "asahi/lib/agx_linker.h" +#include "asahi/lib/agx_nir_passes.h" +#include "asahi/lib/agx_tilebuffer.h" +#include "asahi/lib/agx_uvs.h" +#include "compiler/spirv/nir_spirv.h" + +#include "util/blob.h" +#include "util/hash_table.h" +#include "util/macros.h" +#include "util/mesa-sha1.h" +#include "util/simple_mtx.h" +#include "util/u_debug.h" +#include "vulkan/vulkan_core.h" + +struct hk_fs_key { + bool zs_self_dep; + + /** True if sample shading is forced on via an API knob such as + * VkPipelineMultisampleStateCreateInfo::minSampleShading + */ + bool force_sample_shading; + + uint8_t pad[2]; +}; +static_assert(sizeof(struct hk_fs_key) == 4, "packed"); + +static void +shared_var_info(const struct glsl_type *type, unsigned *size, unsigned *align) +{ + assert(glsl_type_is_vector_or_scalar(type)); + + uint32_t comp_size = + glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8; + unsigned length = glsl_get_vector_elements(type); + *size = comp_size * length, *align = comp_size; +} + +uint64_t +hk_physical_device_compiler_flags(const struct hk_physical_device *pdev) +{ + /* TODO compiler flags */ + return 0; +} + +const nir_shader_compiler_options * +hk_get_nir_options(struct vk_physical_device *vk_pdev, gl_shader_stage stage, + UNUSED const struct vk_pipeline_robustness_state *rs) +{ + return &agx_nir_options; +} + +static struct spirv_to_nir_options +hk_get_spirv_options(struct vk_physical_device *vk_pdev, + UNUSED gl_shader_stage stage, + const struct vk_pipeline_robustness_state *rs) +{ + return (struct spirv_to_nir_options){ + .ssbo_addr_format = hk_buffer_addr_format(rs->storage_buffers), + .phys_ssbo_addr_format = nir_address_format_64bit_global, + .ubo_addr_format = hk_buffer_addr_format(rs->uniform_buffers), + .shared_addr_format = nir_address_format_32bit_offset, + .min_ssbo_alignment = HK_MIN_SSBO_ALIGNMENT, + .min_ubo_alignment = HK_MIN_UBO_ALIGNMENT, + }; +} + +static bool +lower_halt_to_return(nir_builder *b, nir_instr *instr, UNUSED void *_data) +{ + if (instr->type != nir_instr_type_jump) + return false; + + nir_jump_instr *jump = nir_instr_as_jump(instr); + if (jump->type != nir_jump_halt) + return false; + + assert(b->impl == nir_shader_get_entrypoint(b->shader)); + jump->type = nir_jump_return; + return true; +} + +void +hk_preprocess_nir_internal(struct vk_physical_device *vk_pdev, nir_shader *nir) +{ + /* Must lower before io to temps */ + if (nir->info.stage == MESA_SHADER_FRAGMENT) { + NIR_PASS(_, nir, nir_lower_terminate_to_demote); + NIR_PASS(_, nir, nir_shader_instructions_pass, lower_halt_to_return, + nir_metadata_all, NULL); + NIR_PASS(_, nir, nir_lower_returns); + } + + /* Unroll loops before lowering indirects via nir_lower_io_to_temporaries */ + UNUSED bool progress = false; + 
NIR_PASS(_, nir, nir_lower_global_vars_to_local); + + do { + progress = false; + NIR_PASS(progress, nir, nir_lower_vars_to_ssa); + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_constant_folding); + NIR_PASS(progress, nir, nir_opt_loop); + NIR_PASS(progress, nir, nir_opt_loop_unroll); + } while (progress); + + if (nir->info.stage == MESA_SHADER_FRAGMENT) { + struct nir_lower_sysvals_to_varyings_options sysvals_opts = { + .point_coord = true, + }; + + nir_lower_sysvals_to_varyings(nir, &sysvals_opts); + } + + NIR_PASS(_, nir, nir_lower_system_values); + + /* Gather info before preprocess_nir but after some general lowering, so + * inputs_read and system_values_read are accurately set. + */ + nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); + + NIR_PASS_V(nir, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir), + true, false); + + NIR_PASS(_, nir, nir_lower_global_vars_to_local); + + NIR_PASS(_, nir, nir_split_var_copies); + NIR_PASS(_, nir, nir_split_struct_vars, nir_var_function_temp); + + /* Optimize but allow copies because we haven't lowered them yet */ + agx_preprocess_nir(nir, NULL); + + NIR_PASS(_, nir, nir_lower_load_const_to_scalar); + NIR_PASS(_, nir, nir_lower_var_copies); +} + +static void +hk_preprocess_nir(struct vk_physical_device *vk_pdev, nir_shader *nir) +{ + hk_preprocess_nir_internal(vk_pdev, nir); + nir_lower_compute_system_values_options csv_options = { + .has_base_workgroup_id = true, + }; + NIR_PASS(_, nir, nir_lower_compute_system_values, &csv_options); +} + +static void +hk_populate_fs_key(struct hk_fs_key *key, + const struct vk_graphics_pipeline_state *state) +{ + memset(key, 0, sizeof(*key)); + + if (state == NULL) + return; + + if (state->pipeline_flags & + VK_PIPELINE_CREATE_2_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT) + key->zs_self_dep = true; + + /* We force per-sample interpolation whenever sampleShadingEnable is set + * regardless of minSampleShading or rasterizationSamples. + * + * When sampleShadingEnable is set, few guarantees are made about the + * location of interpolation of the inputs. The only real guarantees are + * that the inputs are interpolated within the pixel and that you get at + * least `rasterizationSamples * minSampleShading` unique positions. + * Importantly, it does not require that when `rasterizationSamples * + * minSampleShading <= 1.0` that those positions are at the fragment + * center. Therefore, it's valid to just always do per-sample all the time. + * + * The one caveat here is that we have to be careful about gl_SampleMaskIn. + * When `hk_fs_key::force_sample_shading = true` we also turn any reads of + * gl_SampleMaskIn into `1 << gl_SampleID` because the hardware sample mask + * is actually per-fragment, not per-pass. We handle this by smashing + * minSampleShading to 1.0 whenever gl_SampleMaskIn is read. 
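+    *
+    * Concrete example of the guarantee: with rasterizationSamples = 4 and
+    * minSampleShading = 0.25, only ceil(4 * 0.25) = 1 unique sample position
+    * is required, and it need not be the pixel centre, so shading per-sample
+    * is a conforming implementation of that case as well.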
+ */ + const struct vk_multisample_state *ms = state->ms; + if (ms != NULL && ms->sample_shading_enable) + key->force_sample_shading = true; +} + +static void +hk_hash_graphics_state(struct vk_physical_device *device, + const struct vk_graphics_pipeline_state *state, + VkShaderStageFlags stages, blake3_hash blake3_out) +{ + struct mesa_blake3 blake3_ctx; + _mesa_blake3_init(&blake3_ctx); + if (stages & VK_SHADER_STAGE_FRAGMENT_BIT) { + struct hk_fs_key key; + hk_populate_fs_key(&key, state); + _mesa_blake3_update(&blake3_ctx, &key, sizeof(key)); + + const bool is_multiview = state->rp->view_mask != 0; + _mesa_blake3_update(&blake3_ctx, &is_multiview, sizeof(is_multiview)); + } + _mesa_blake3_final(&blake3_ctx, blake3_out); +} + +static bool +lower_load_global_constant_offset_instr(nir_builder *b, + nir_intrinsic_instr *intrin, + UNUSED void *_data) +{ + if (intrin->intrinsic != nir_intrinsic_load_global_constant_offset && + intrin->intrinsic != nir_intrinsic_load_global_constant_bounded) + return false; + + b->cursor = nir_before_instr(&intrin->instr); + + nir_def *base_addr = intrin->src[0].ssa; + nir_def *offset = intrin->src[1].ssa; + + nir_def *zero = NULL; + if (intrin->intrinsic == nir_intrinsic_load_global_constant_bounded) { + nir_def *bound = intrin->src[2].ssa; + + unsigned bit_size = intrin->def.bit_size; + assert(bit_size >= 8 && bit_size % 8 == 0); + unsigned byte_size = bit_size / 8; + + zero = nir_imm_zero(b, intrin->num_components, bit_size); + + unsigned load_size = byte_size * intrin->num_components; + + nir_def *sat_offset = + nir_umin(b, offset, nir_imm_int(b, UINT32_MAX - (load_size - 1))); + nir_def *in_bounds = + nir_ilt(b, nir_iadd_imm(b, sat_offset, load_size - 1), bound); + + nir_push_if(b, in_bounds); + } + + nir_def *val = nir_build_load_global_constant( + b, intrin->def.num_components, intrin->def.bit_size, + nir_iadd(b, base_addr, nir_u2u64(b, offset)), + .align_mul = nir_intrinsic_align_mul(intrin), + .align_offset = nir_intrinsic_align_offset(intrin), + .access = nir_intrinsic_access(intrin)); + + if (intrin->intrinsic == nir_intrinsic_load_global_constant_bounded) { + nir_pop_if(b, NULL); + val = nir_if_phi(b, val, zero); + } + + nir_def_rewrite_uses(&intrin->def, val); + + return true; +} + +struct lower_ycbcr_state { + uint32_t set_layout_count; + struct vk_descriptor_set_layout *const *set_layouts; +}; + +static const struct vk_ycbcr_conversion_state * +lookup_ycbcr_conversion(const void *_state, uint32_t set, uint32_t binding, + uint32_t array_index) +{ + const struct lower_ycbcr_state *state = _state; + assert(set < state->set_layout_count); + assert(state->set_layouts[set] != NULL); + const struct hk_descriptor_set_layout *set_layout = + vk_to_hk_descriptor_set_layout(state->set_layouts[set]); + assert(binding < set_layout->binding_count); + + const struct hk_descriptor_set_binding_layout *bind_layout = + &set_layout->binding[binding]; + + if (bind_layout->immutable_samplers == NULL) + return NULL; + + array_index = MIN2(array_index, bind_layout->array_size - 1); + + const struct hk_sampler *sampler = + bind_layout->immutable_samplers[array_index]; + + return sampler && sampler->vk.ycbcr_conversion + ? 
&sampler->vk.ycbcr_conversion->state + : NULL; +} + +static inline bool +nir_has_image_var(nir_shader *nir) +{ + nir_foreach_image_variable(_, nir) + return true; + + return false; +} + +static int +glsl_type_size(const struct glsl_type *type, bool bindless) +{ + return glsl_count_attribute_slots(type, false); +} + +/* + * This is the world's worst multiview implementation. We simply duplicate each + * draw on the CPU side, changing a uniform in between, and then plumb the view + * index into the layer ID here. Whatever, it works. + * + * The "proper" implementation on AGX would use vertex amplification, but a + * MacBook is not a VR headset. + */ +static void +hk_lower_multiview(nir_shader *nir) +{ + /* If there's an existing layer ID write, ignore it. This avoids validation + * splat with vk_meta. + */ + nir_variable *existing = nir_find_variable_with_location( + nir, nir_var_shader_out, VARYING_SLOT_LAYER); + + if (existing) { + existing->data.mode = nir_var_shader_temp; + existing->data.location = 0; + nir_fixup_deref_modes(nir); + } + + /* Now write the view index as the layer */ + nir_builder b = + nir_builder_at(nir_after_impl(nir_shader_get_entrypoint(nir))); + + nir_variable *layer = + nir_variable_create(nir, nir_var_shader_out, glsl_uint_type(), NULL); + + layer->data.location = VARYING_SLOT_LAYER; + + nir_store_var(&b, layer, nir_load_view_index(&b), nir_component_mask(1)); + b.shader->info.outputs_written |= VARYING_BIT_LAYER; +} + +/* + * KHR_maintenance5 requires that points rasterize with a default point size of + * 1.0, while our hardware requires an explicit point size write for this. + * Since topology may be dynamic, we insert an unconditional write if necessary. + */ +static bool +hk_nir_insert_psiz_write(nir_shader *nir) +{ + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + + if (nir->info.outputs_written & VARYING_BIT_PSIZ) { + nir_metadata_preserve(impl, nir_metadata_all); + return false; + } + + nir_builder b = nir_builder_at(nir_after_impl(impl)); + + nir_store_output(&b, nir_imm_float(&b, 1.0), nir_imm_int(&b, 0), + .write_mask = nir_component_mask(1), + .io_semantics.location = VARYING_SLOT_PSIZ, + .io_semantics.num_slots = 1, .src_type = nir_type_float32); + + nir->info.outputs_written |= VARYING_BIT_PSIZ; + nir_metadata_preserve(b.impl, nir_metadata_control_flow); + return true; +} + +static nir_def * +query_custom_border(nir_builder *b, nir_tex_instr *tex) +{ + return nir_build_texture_query(b, tex, nir_texop_custom_border_color_agx, 4, + tex->dest_type, false, false); +} + +static nir_def * +has_custom_border(nir_builder *b, nir_tex_instr *tex) +{ + return nir_build_texture_query(b, tex, nir_texop_has_custom_border_color_agx, + 1, nir_type_bool1, false, false); +} + +static bool +lower(nir_builder *b, nir_instr *instr, UNUSED void *_data) +{ + if (instr->type != nir_instr_type_tex) + return false; + + nir_tex_instr *tex = nir_instr_as_tex(instr); + if (!nir_tex_instr_need_sampler(tex) || nir_tex_instr_is_query(tex)) + return false; + + /* XXX: this is a really weird edge case, is this even well-defined? 
*/ + if (tex->is_shadow) + return false; + + b->cursor = nir_after_instr(&tex->instr); + nir_def *has_custom = has_custom_border(b, tex); + + nir_instr *orig = nir_instr_clone(b->shader, &tex->instr); + nir_builder_instr_insert(b, orig); + nir_def *clamp_to_1 = &nir_instr_as_tex(orig)->def; + + nir_push_if(b, has_custom); + nir_def *replaced = NULL; + { + /* Sample again, this time with clamp-to-0 instead of clamp-to-1 */ + nir_instr *clone_instr = nir_instr_clone(b->shader, &tex->instr); + nir_builder_instr_insert(b, clone_instr); + + nir_tex_instr *tex_0 = nir_instr_as_tex(clone_instr); + nir_def *clamp_to_0 = &tex_0->def; + + tex_0->backend_flags |= AGX_TEXTURE_FLAG_CLAMP_TO_0; + + /* Grab the border colour */ + nir_def *border = query_custom_border(b, tex_0); + + if (tex->op == nir_texop_tg4) { + border = nir_replicate(b, nir_channel(b, border, tex->component), 4); + } + + /* Combine together with the border */ + if (nir_alu_type_get_base_type(tex->dest_type) == nir_type_float && + tex->op != nir_texop_tg4) { + + /* For floats, lerp together: + * + * For border texels: (1 * border) + (0 * border ) = border + * For regular texels: (x * border) + (x * (1 - border)) = x. + * + * Linear filtering is linear (duh), so lerping is compatible. + */ + replaced = nir_flrp(b, clamp_to_0, clamp_to_1, border); + } else { + /* For integers, just select componentwise since there is no linear + * filtering. Gathers also use this path since they are unfiltered in + * each component. + */ + replaced = nir_bcsel(b, nir_ieq(b, clamp_to_0, clamp_to_1), clamp_to_0, + border); + } + } + nir_pop_if(b, NULL); + + /* Put it together with a phi */ + nir_def *phi = nir_if_phi(b, replaced, clamp_to_1); + nir_def_replace(&tex->def, phi); + return true; +} + +static bool +agx_nir_lower_custom_border(nir_shader *nir) +{ + return nir_shader_instructions_pass(nir, lower, nir_metadata_none, NULL); +} + +/* + * In Vulkan, the VIEWPORT should read 0 in the fragment shader if it is not + * written by the vertex shader, but in our implementation, the varying would + * otherwise be undefined. This small pass predicates VIEWPORT reads based on + * whether the hardware vertex shader writes the VIEWPORT (nonzero UVS index). 
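+ *
+ * In effect, each fragment-shader load of VARYING_SLOT_VIEWPORT is rewritten
+ * from roughly "vp = load_input(VIEWPORT)" into
+ * "vp = (load_uvs_index_agx(VIEWPORT) != 0) ? load_input(VIEWPORT) : 0".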
+ */ +static bool +lower_viewport_fs(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *data) +{ + if (intr->intrinsic != nir_intrinsic_load_input) + return false; + + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + if (sem.location != VARYING_SLOT_VIEWPORT) + return false; + + b->cursor = nir_after_instr(&intr->instr); + nir_def *orig = &intr->def; + + nir_def *uvs = nir_load_uvs_index_agx(b, .io_semantics = sem); + nir_def *def = nir_bcsel(b, nir_ine_imm(b, uvs, 0), orig, nir_imm_int(b, 0)); + + nir_def_rewrite_uses_after(orig, def, def->parent_instr); + return true; +} + +static bool +lower_subpass_dim(nir_builder *b, nir_instr *instr, UNUSED void *_data) +{ + if (instr->type != nir_instr_type_tex) + return false; + + nir_tex_instr *tex = nir_instr_as_tex(instr); + if (tex->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS) + tex->sampler_dim = GLSL_SAMPLER_DIM_2D; + else if (tex->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) + tex->sampler_dim = GLSL_SAMPLER_DIM_MS; + else + return false; + + return true; +} + +void +hk_lower_nir(struct hk_device *dev, nir_shader *nir, + const struct vk_pipeline_robustness_state *rs, bool is_multiview, + uint32_t set_layout_count, + struct vk_descriptor_set_layout *const *set_layouts) +{ + if (nir->info.stage == MESA_SHADER_FRAGMENT) { + NIR_PASS(_, nir, nir_lower_input_attachments, + &(nir_input_attachment_options){ + .use_fragcoord_sysval = true, + .use_layer_id_sysval = true, + .use_view_id_for_layer = is_multiview, + }); + + NIR_PASS(_, nir, nir_shader_instructions_pass, lower_subpass_dim, + nir_metadata_all, NULL); + NIR_PASS(_, nir, nir_lower_wpos_center); + } + + /* XXX: should be last geometry stage, how do I get to that? */ + if (nir->info.stage == MESA_SHADER_VERTEX) { + if (is_multiview) + hk_lower_multiview(nir); + } + + if (nir->info.stage == MESA_SHADER_TESS_EVAL) { + NIR_PASS(_, nir, nir_lower_patch_vertices, + nir->info.tess.tcs_vertices_out, NULL); + } + + const struct lower_ycbcr_state ycbcr_state = { + .set_layout_count = set_layout_count, + .set_layouts = set_layouts, + }; + NIR_PASS(_, nir, nir_vk_lower_ycbcr_tex, lookup_ycbcr_conversion, + &ycbcr_state); + + /* Lower push constants before lower_descriptors */ + NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_push_const, + nir_address_format_32bit_offset); + + // NIR_PASS(_, nir, nir_opt_large_constants, NULL, 32); + + /* Images accessed through the texture or PBE hardware are robust, so we + * don't set lower_image. (There are some sticky details around txf but + * they're handled by agx_nir_lower_texture). However, image atomics are + * software so require robustness lowering. + */ + nir_lower_robust_access_options robustness = { + .lower_image_atomic = true, + }; + + NIR_PASS(_, nir, nir_lower_robust_access, &robustness); + + /* We must do early lowering before hk_nir_lower_descriptors, since this will + * create lod_bias_agx instructions. 
+ */ + NIR_PASS(_, nir, agx_nir_lower_texture_early, true /* support_lod_bias */); + NIR_PASS(_, nir, agx_nir_lower_custom_border); + + NIR_PASS(_, nir, hk_nir_lower_descriptors, rs, set_layout_count, + set_layouts); + NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_global, + nir_address_format_64bit_global); + NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ssbo, + hk_buffer_addr_format(rs->storage_buffers)); + NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ubo, + hk_buffer_addr_format(rs->uniform_buffers)); + NIR_PASS(_, nir, nir_shader_intrinsics_pass, + lower_load_global_constant_offset_instr, nir_metadata_none, NULL); + + if (!nir->info.shared_memory_explicit_layout) { + /* There may be garbage in shared_size, but it's the job of + * nir_lower_vars_to_explicit_types to allocate it. We have to reset to + * avoid overallocation. + */ + nir->info.shared_size = 0; + + NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_mem_shared, + shared_var_info); + } + NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_shared, + nir_address_format_32bit_offset); + + if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) { + /* Align everything up to 16B so we can write whole vec4s. */ + nir->info.shared_size = align(nir->info.shared_size, 16); + NIR_PASS(_, nir, nir_zero_initialize_shared_memory, nir->info.shared_size, + 16); + + /* We need to call lower_compute_system_values again because + * nir_zero_initialize_shared_memory generates load_invocation_id which + * has to be lowered to load_invocation_index. + */ + NIR_PASS(_, nir, nir_lower_compute_system_values, NULL); + } + + /* TODO: we can do indirect VS output */ + nir_variable_mode lower_indirect_modes = 0; + if (nir->info.stage == MESA_SHADER_FRAGMENT) + lower_indirect_modes |= nir_var_shader_out; + else if (nir->info.stage == MESA_SHADER_VERTEX) + lower_indirect_modes |= nir_var_shader_in | nir_var_shader_out; + + NIR_PASS(_, nir, nir_lower_indirect_derefs, lower_indirect_modes, + UINT32_MAX); + + NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, + glsl_type_size, nir_lower_io_lower_64bit_to_32); + + if (nir->info.stage == MESA_SHADER_FRAGMENT) { + NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_viewport_fs, + nir_metadata_control_flow, NULL); + } + + NIR_PASS(_, nir, agx_nir_lower_texture); + NIR_PASS(_, nir, agx_nir_lower_multisampled_image_store); + + agx_preprocess_nir(nir, dev->dev.libagx); + NIR_PASS(_, nir, nir_opt_conditional_discard); + NIR_PASS(_, nir, nir_opt_if, + nir_opt_if_optimize_phi_true_false | nir_opt_if_avoid_64bit_phis); +} + +static void +hk_upload_shader(struct hk_device *dev, struct hk_shader *shader) +{ + if (shader->b.info.has_preamble) { + unsigned offs = shader->b.info.preamble_offset; + assert(offs < shader->b.binary_size); + + size_t size = shader->b.binary_size - offs; + assert(size > 0); + + shader->bo = agx_bo_create(&dev->dev, size, AGX_BO_EXEC | AGX_BO_LOW_VA, + "Preamble"); + memcpy(shader->bo->ptr.cpu, shader->b.binary + offs, size); + shader->preamble_addr = shader->bo->ptr.gpu; + } + + if (!shader->linked.ht) { + /* If we only have a single variant, link now. 
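+       * Vertex and fragment shaders instead keep a hash table of linked
+       * variants (see hk_init_link_ht), since their prologs/epilogs depend
+       * on state only known at draw time; every other stage has exactly one
+       * linked form, so it can be produced eagerly here.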
*/ + shader->only_linked = hk_fast_link(dev, false, shader, NULL, NULL, 0); + } + + if (shader->info.stage == MESA_SHADER_FRAGMENT) { + agx_pack(&shader->frag_face, FRAGMENT_FACE_2, cfg) { + cfg.conservative_depth = + agx_translate_depth_layout(shader->b.info.depth_layout); + } + } + + agx_pack(&shader->counts, COUNTS, cfg) { + cfg.uniform_register_count = shader->b.info.push_count; + cfg.preshader_register_count = shader->b.info.nr_preamble_gprs; + cfg.sampler_state_register_count = agx_translate_sampler_state_count( + shader->b.info.uses_txf ? 1 : 0, false); + } +} + +DERIVE_HASH_TABLE(hk_fast_link_key_vs); +DERIVE_HASH_TABLE(hk_fast_link_key_fs); + +static VkResult +hk_init_link_ht(struct hk_shader *shader, gl_shader_stage sw_stage) +{ + simple_mtx_init(&shader->linked.lock, mtx_plain); + + bool multiple_variants = + sw_stage == MESA_SHADER_VERTEX || sw_stage == MESA_SHADER_FRAGMENT; + + if (!multiple_variants) + return VK_SUCCESS; + + if (sw_stage == MESA_SHADER_VERTEX) + shader->linked.ht = hk_fast_link_key_vs_table_create(NULL); + else + shader->linked.ht = hk_fast_link_key_fs_table_create(NULL); + + return (shader->linked.ht == NULL) ? VK_ERROR_OUT_OF_HOST_MEMORY + : VK_SUCCESS; +} + +static VkResult +hk_compile_nir(struct hk_device *dev, const VkAllocationCallbacks *pAllocator, + nir_shader *nir, VkShaderCreateFlagsEXT shader_flags, + const struct vk_pipeline_robustness_state *rs, + const struct hk_fs_key *fs_key, struct hk_shader *shader, + gl_shader_stage sw_stage, bool hw, nir_xfb_info *xfb_info) +{ + unsigned vs_uniform_base = 0; + + /* For now, only shader objects are supported */ + if (sw_stage == MESA_SHADER_VERTEX) { + vs_uniform_base = + 6 * DIV_ROUND_UP( + BITSET_LAST_BIT(shader->info.vs.attrib_components_read), 4); + } else if (sw_stage == MESA_SHADER_FRAGMENT) { + shader->info.fs.interp = agx_gather_interp_info(nir); + shader->info.fs.writes_memory = nir->info.writes_memory; + + /* Discards must be lowering before lowering MSAA to handle discards */ + NIR_PASS(_, nir, agx_nir_lower_discard_zs_emit); + NIR_PASS(_, nir, agx_nir_lower_fs_output_to_epilog, + &shader->info.fs.epilog_key); + NIR_PASS(_, nir, agx_nir_lower_sample_mask); + + if (nir->info.fs.uses_sample_shading) { + /* Ensure the sample ID is preserved in register */ + nir_builder b = + nir_builder_at(nir_after_impl(nir_shader_get_entrypoint(nir))); + nir_export_agx(&b, nir_load_exported_agx(&b, 1, 16, .base = 1), + .base = 1); + + NIR_PASS(_, nir, agx_nir_lower_to_per_sample); + } + + NIR_PASS(_, nir, agx_nir_lower_fs_active_samples_to_register); + NIR_PASS(_, nir, agx_nir_lower_interpolation); + } else if (sw_stage == MESA_SHADER_TESS_EVAL) { + shader->info.ts.ccw = nir->info.tess.ccw; + shader->info.ts.point_mode = nir->info.tess.point_mode; + shader->info.ts.spacing = nir->info.tess.spacing; + shader->info.ts.mode = nir->info.tess._primitive_mode; + + if (nir->info.tess.point_mode) { + shader->info.ts.out_prim = MESA_PRIM_POINTS; + } else if (nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES) { + shader->info.ts.out_prim = MESA_PRIM_LINES; + } else { + shader->info.ts.out_prim = MESA_PRIM_TRIANGLES; + } + + /* This destroys info so it needs to happen after the gather */ + NIR_PASS(_, nir, agx_nir_lower_tes, dev->dev.libagx, hw); + } else if (sw_stage == MESA_SHADER_TESS_CTRL) { + shader->info.tcs.output_patch_size = nir->info.tess.tcs_vertices_out; + shader->info.tcs.per_vertex_outputs = agx_tcs_per_vertex_outputs(nir); + shader->info.tcs.nr_patch_outputs = + 
util_last_bit(nir->info.patch_outputs_written); + shader->info.tcs.output_stride = agx_tcs_output_stride(nir); + } + + uint64_t outputs = nir->info.outputs_written; + if (!hw && + (sw_stage == MESA_SHADER_VERTEX || sw_stage == MESA_SHADER_TESS_EVAL)) { + nir->info.stage = MESA_SHADER_COMPUTE; + memset(&nir->info.cs, 0, sizeof(nir->info.cs)); + nir->xfb_info = NULL; + } + + /* XXX: rename */ + NIR_PASS(_, nir, hk_lower_uvs_index, vs_uniform_base); + +#if 0 + /* TODO */ + nir_variable_mode robust2_modes = 0; + if (rs->uniform_buffers == VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT) + robust2_modes |= nir_var_mem_ubo; + if (rs->storage_buffers == VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT) + robust2_modes |= nir_var_mem_ssbo; +#endif + + struct agx_shader_key backend_key = { + .needs_g13x_coherency = (dev->dev.params.gpu_generation == 13 && + dev->dev.params.num_clusters_total > 1) || + dev->dev.params.num_dies > 1, + .reserved_preamble = 128 /* TODO */, + .libagx = dev->dev.libagx, + .no_stop = nir->info.stage == MESA_SHADER_FRAGMENT, + .has_scratch = true, + }; + + /* For now, sample shading is always dynamic. Indicate that. */ + if (nir->info.stage == MESA_SHADER_FRAGMENT && + nir->info.fs.uses_sample_shading) + backend_key.fs.inside_sample_loop = true; + + agx_compile_shader_nir(nir, &backend_key, NULL, &shader->b); + + shader->code_ptr = shader->b.binary; + shader->code_size = shader->b.binary_size; + + shader->info.stage = sw_stage; + shader->info.clip_distance_array_size = nir->info.clip_distance_array_size; + shader->info.cull_distance_array_size = nir->info.cull_distance_array_size; + shader->b.info.outputs = outputs; + + if (sw_stage == MESA_SHADER_COMPUTE) { + for (unsigned i = 0; i < 3; ++i) + shader->info.cs.local_size[i] = nir->info.workgroup_size[i]; + } + + if (xfb_info) { + assert(xfb_info->output_count < ARRAY_SIZE(shader->info.xfb_outputs)); + + memcpy(&shader->info.xfb_info, xfb_info, + nir_xfb_info_size(xfb_info->output_count)); + + typed_memcpy(shader->info.xfb_stride, nir->info.xfb_stride, 4); + } + + if (nir->constant_data_size > 0) { + uint32_t data_size = align(nir->constant_data_size, HK_MIN_UBO_ALIGNMENT); + + void *data = malloc(data_size); + if (data == NULL) { + ralloc_free(nir); + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + memcpy(data, nir->constant_data, nir->constant_data_size); + + assert(nir->constant_data_size <= data_size); + memset(data + nir->constant_data_size, 0, + data_size - nir->constant_data_size); + + shader->data_ptr = data; + shader->data_size = data_size; + } + + ralloc_free(nir); + + VkResult result = hk_init_link_ht(shader, sw_stage); + if (result != VK_SUCCESS) + return vk_error(dev, result); + + hk_upload_shader(dev, shader); + return VK_SUCCESS; +} + +static const struct vk_shader_ops hk_shader_ops; + +static void +hk_destroy_linked_shader(struct hk_linked_shader *linked) +{ + agx_bo_unreference(linked->b.bo); + ralloc_free(linked); +} + +static void +hk_destroy_linked_shader_ht(struct hash_entry *he) +{ + hk_destroy_linked_shader(he->data); +} + +static void +hk_shader_destroy(struct hk_shader *s) +{ + free((void *)s->code_ptr); + free((void *)s->data_ptr); + agx_bo_unreference(s->bo); + + simple_mtx_destroy(&s->linked.lock); + _mesa_hash_table_destroy(s->linked.ht, hk_destroy_linked_shader_ht); + + if (s->only_linked) + hk_destroy_linked_shader(s->only_linked); +} + +void +hk_api_shader_destroy(struct vk_device *vk_dev, struct vk_shader *vk_shader, + const VkAllocationCallbacks 
*pAllocator) +{ + struct hk_device *dev = container_of(vk_dev, struct hk_device, vk); + struct hk_api_shader *obj = + container_of(vk_shader, struct hk_api_shader, vk); + + hk_foreach_variant(obj, shader) { + hk_shader_destroy(shader); + } + + vk_shader_free(&dev->vk, pAllocator, &obj->vk); +} + +static void +hk_lower_hw_vs(nir_shader *nir, struct hk_shader *shader) +{ + /* Point size must be clamped, excessively large points don't render + * properly on G13. + * + * Must be synced with pointSizeRange. + */ + NIR_PASS(_, nir, nir_lower_point_size, 1.0f, 511.95f); + + /* TODO: Optimize out for monolithic? */ + NIR_PASS(_, nir, hk_nir_insert_psiz_write); + + NIR_PASS(_, nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL); + NIR_PASS(_, nir, agx_nir_lower_cull_distance_vs); + + NIR_PASS(_, nir, agx_nir_lower_uvs, &shader->info.uvs); + + shader->info.vs.cull_distance_array_size = + nir->info.cull_distance_array_size; +} + +VkResult +hk_compile_shader(struct hk_device *dev, struct vk_shader_compile_info *info, + const struct vk_graphics_pipeline_state *state, + const VkAllocationCallbacks *pAllocator, + struct hk_api_shader **shader_out) +{ + VkResult result; + + /* We consume the NIR, regardless of success or failure */ + nir_shader *nir = info->nir; + + size_t size = sizeof(struct hk_api_shader) + + sizeof(struct hk_shader) * hk_num_variants(info->stage); + struct hk_api_shader *obj = + vk_shader_zalloc(&dev->vk, &hk_shader_ops, info->stage, pAllocator, size); + + if (obj == NULL) { + ralloc_free(nir); + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + /* TODO: Multiview with ESO */ + const bool is_multiview = state && state->rp->view_mask != 0; + + hk_lower_nir(dev, nir, info->robustness, is_multiview, + info->set_layout_count, info->set_layouts); + + gl_shader_stage sw_stage = nir->info.stage; + + struct hk_fs_key fs_key_tmp, *fs_key = NULL; + if (sw_stage == MESA_SHADER_FRAGMENT) { + hk_populate_fs_key(&fs_key_tmp, state); + fs_key = &fs_key_tmp; + + nir->info.fs.uses_sample_shading |= fs_key->force_sample_shading; + + /* Force late-Z for Z/S self-deps. TODO: There's probably a less silly way + * to do this. + */ + if (fs_key->zs_self_dep) { + nir_builder b = + nir_builder_at(nir_before_impl(nir_shader_get_entrypoint(nir))); + nir_discard_if(&b, nir_imm_false(&b)); + nir->info.fs.uses_discard = true; + } + + NIR_PASS(_, nir, agx_nir_lower_sample_intrinsics, false); + } else if (sw_stage == MESA_SHADER_TESS_CTRL) { + NIR_PASS_V(nir, agx_nir_lower_tcs, dev->dev.libagx); + } + + /* Compile all variants up front */ + if (sw_stage == MESA_SHADER_GEOMETRY) { + for (unsigned rast_disc = 0; rast_disc < 2; ++rast_disc) { + struct hk_shader *count_variant = hk_count_gs_variant(obj, rast_disc); + nir_shader *clone = nir_shader_clone(NULL, nir); + + enum mesa_prim out_prim = MESA_PRIM_MAX; + nir_shader *count = NULL, *rast = NULL, *pre_gs = NULL; + + NIR_PASS(_, clone, agx_nir_lower_gs, dev->dev.libagx, rast_disc, + &count, &rast, &pre_gs, &out_prim, + &count_variant->info.gs.count_words); + + if (!rast_disc) { + struct hk_shader *shader = &obj->variants[HK_GS_VARIANT_RAST]; + + hk_lower_hw_vs(rast, shader); + shader->info.gs.out_prim = out_prim; + } + + struct { + nir_shader *in; + struct hk_shader *out; + } variants[] = { + {clone, hk_main_gs_variant(obj, rast_disc)}, + {pre_gs, hk_pre_gs_variant(obj, rast_disc)}, + {count, count_variant}, + {rast_disc ? 
NULL : rast, &obj->variants[HK_GS_VARIANT_RAST]}, + }; + + for (unsigned v = 0; v < ARRAY_SIZE(variants); ++v) { + if (variants[v].in) { + result = hk_compile_nir(dev, pAllocator, variants[v].in, + info->flags, info->robustness, NULL, + variants[v].out, sw_stage, true, NULL); + if (result != VK_SUCCESS) { + hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator); + ralloc_free(nir); + return result; + } + } + } + } + } else if (sw_stage == MESA_SHADER_VERTEX || + sw_stage == MESA_SHADER_TESS_EVAL) { + + if (sw_stage == MESA_SHADER_VERTEX) { + assert( + !(nir->info.inputs_read & BITFIELD64_MASK(VERT_ATTRIB_GENERIC0)) && + "Fixed-function attributes not used in Vulkan"); + + NIR_PASS(_, nir, nir_recompute_io_bases, nir_var_shader_in); + } + + /* the shader_out portion of this is load-bearing even for tess eval */ + NIR_PASS(_, nir, nir_io_add_const_offset_to_base, + nir_var_shader_in | nir_var_shader_out); + + for (enum hk_vs_variant v = 0; v < HK_VS_VARIANTS; ++v) { + struct hk_shader *shader = &obj->variants[v]; + bool hw = v == HK_VS_VARIANT_HW; + + /* TODO: Optimize single variant when we know nextStage */ + nir_shader *clone = nir_shader_clone(NULL, nir); + + if (sw_stage == MESA_SHADER_VERTEX) { + NIR_PASS(_, clone, agx_nir_lower_vs_input_to_prolog, + shader->info.vs.attrib_components_read); + + shader->info.vs.attribs_read = + nir->info.inputs_read >> VERT_ATTRIB_GENERIC0; + } + + if (hw) { + hk_lower_hw_vs(clone, shader); + } else { + NIR_PASS(_, clone, agx_nir_lower_vs_before_gs, dev->dev.libagx); + } + + result = hk_compile_nir(dev, pAllocator, clone, info->flags, + info->robustness, fs_key, shader, sw_stage, hw, + nir->xfb_info); + if (result != VK_SUCCESS) { + hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator); + ralloc_free(nir); + return result; + } + } + } else { + struct hk_shader *shader = hk_only_variant(obj); + + /* hk_compile_nir takes ownership of nir */ + result = + hk_compile_nir(dev, pAllocator, nir, info->flags, info->robustness, + fs_key, shader, sw_stage, true, NULL); + if (result != VK_SUCCESS) { + hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator); + return result; + } + } + + *shader_out = obj; + return VK_SUCCESS; +} + +static VkResult +hk_compile_shaders(struct vk_device *vk_dev, uint32_t shader_count, + struct vk_shader_compile_info *infos, + const struct vk_graphics_pipeline_state *state, + const VkAllocationCallbacks *pAllocator, + struct vk_shader **shaders_out) +{ + struct hk_device *dev = container_of(vk_dev, struct hk_device, vk); + + for (uint32_t i = 0; i < shader_count; i++) { + VkResult result = + hk_compile_shader(dev, &infos[i], state, pAllocator, + (struct hk_api_shader **)&shaders_out[i]); + if (result != VK_SUCCESS) { + /* Clean up all the shaders before this point */ + for (uint32_t j = 0; j < i; j++) + hk_api_shader_destroy(&dev->vk, shaders_out[j], pAllocator); + + /* Clean up all the NIR after this point */ + for (uint32_t j = i + 1; j < shader_count; j++) + ralloc_free(infos[j].nir); + + /* Memset the output array */ + memset(shaders_out, 0, shader_count * sizeof(*shaders_out)); + + return result; + } + } + + return VK_SUCCESS; +} + +static VkResult +hk_deserialize_shader(struct hk_device *dev, struct blob_reader *blob, + struct hk_shader *shader) +{ + struct hk_shader_info info; + blob_copy_bytes(blob, &info, sizeof(info)); + + struct agx_shader_info b_info; + blob_copy_bytes(blob, &b_info, sizeof(b_info)); + + const uint32_t code_size = blob_read_uint32(blob); + const uint32_t data_size = blob_read_uint32(blob); + if (blob->overrun) 
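/*
 * Annotation: hk_compile_shaders above has a strict failure contract: the
 * per-shader compile consumes its NIR even on error, so on failure it must
 * destroy the shaders already built, free only the NIR that was never handed
 * to a compile, and clear the output array.  A simplified standalone model of
 * that contract (all names hypothetical):
 */
#include <string.h>

struct example_shader;

static int
example_compile_all(void *inputs[], unsigned count,
                    struct example_shader *out[],
                    int (*compile)(void *in, struct example_shader **out),
                    void (*destroy)(struct example_shader *),
                    void (*free_input)(void *))
{
   for (unsigned i = 0; i < count; i++) {
      int err = compile(inputs[i], &out[i]);      /* consumes inputs[i] */
      if (err) {
         for (unsigned j = 0; j < i; j++)         /* built so far */
            destroy(out[j]);
         for (unsigned j = i + 1; j < count; j++) /* never consumed */
            free_input(inputs[j]);
         memset(out, 0, count * sizeof(*out));
         return err;
      }
   }
   return 0;
}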
+ return vk_error(dev, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT); + + VkResult result = hk_init_link_ht(shader, info.stage); + if (result != VK_SUCCESS) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + simple_mtx_init(&shader->linked.lock, mtx_plain); + + shader->b.info = b_info; + shader->info = info; + shader->code_size = code_size; + shader->data_size = data_size; + shader->b.binary_size = code_size; + + shader->code_ptr = malloc(code_size); + if (shader->code_ptr == NULL) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + shader->data_ptr = malloc(data_size); + if (shader->data_ptr == NULL) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + blob_copy_bytes(blob, (void *)shader->code_ptr, shader->code_size); + blob_copy_bytes(blob, (void *)shader->data_ptr, shader->data_size); + if (blob->overrun) + return vk_error(dev, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT); + + shader->b.binary = (void *)shader->code_ptr; + hk_upload_shader(dev, shader); + return VK_SUCCESS; +} + +static VkResult +hk_deserialize_api_shader(struct vk_device *vk_dev, struct blob_reader *blob, + uint32_t binary_version, + const VkAllocationCallbacks *pAllocator, + struct vk_shader **shader_out) +{ + struct hk_device *dev = container_of(vk_dev, struct hk_device, vk); + + gl_shader_stage stage = blob_read_uint8(blob); + if (blob->overrun) + return vk_error(dev, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT); + + size_t size = sizeof(struct hk_api_shader) + + sizeof(struct hk_shader) * hk_num_variants(stage); + + struct hk_api_shader *obj = + vk_shader_zalloc(&dev->vk, &hk_shader_ops, stage, pAllocator, size); + + if (obj == NULL) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + hk_foreach_variant(obj, shader) { + VkResult result = hk_deserialize_shader(dev, blob, shader); + + if (result != VK_SUCCESS) { + hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator); + return result; + } + } + + *shader_out = &obj->vk; + return VK_SUCCESS; +} + +static void +hk_shader_serialize(struct vk_device *vk_dev, const struct hk_shader *shader, + struct blob *blob) +{ + blob_write_bytes(blob, &shader->info, sizeof(shader->info)); + blob_write_bytes(blob, &shader->b.info, sizeof(shader->b.info)); + + blob_write_uint32(blob, shader->code_size); + blob_write_uint32(blob, shader->data_size); + blob_write_bytes(blob, shader->code_ptr, shader->code_size); + blob_write_bytes(blob, shader->data_ptr, shader->data_size); +} + +static bool +hk_api_shader_serialize(struct vk_device *vk_dev, + const struct vk_shader *vk_shader, struct blob *blob) +{ + struct hk_api_shader *obj = + container_of(vk_shader, struct hk_api_shader, vk); + + blob_write_uint8(blob, vk_shader->stage); + + hk_foreach_variant(obj, shader) { + hk_shader_serialize(vk_dev, shader, blob); + } + + return !blob->out_of_memory; +} + +#define WRITE_STR(field, ...) 
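/*
 * Annotation: per variant, the binary layout serialized above is the
 * hk_shader_info block, the agx_shader_info block, a u32 code size, a u32
 * data size, then the raw code and data bytes; an hk_api_shader is a u8
 * stage followed by that sequence for every variant.  The driver uses
 * util/blob for this; a hypothetical cursor with the same
 * check-before-trust behaviour would look like:
 */
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

struct example_cursor {
   const uint8_t *ptr, *end;
   bool overrun;
};

static bool
example_copy(struct example_cursor *c, void *dst, size_t size)
{
   if ((size_t)(c->end - c->ptr) < size) {
      c->overrun = true;      /* mirrors the blob_reader overrun flag */
      return false;
   }
   memcpy(dst, c->ptr, size);
   c->ptr += size;
   return true;
}

static bool
example_read_u32(struct example_cursor *c, uint32_t *out)
{
   return example_copy(c, out, sizeof(*out));
}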
\ + ({ \ + memset(field, 0, sizeof(field)); \ + UNUSED int i = snprintf(field, sizeof(field), __VA_ARGS__); \ + assert(i > 0 && i < sizeof(field)); \ + }) + +static VkResult +hk_shader_get_executable_properties( + UNUSED struct vk_device *device, const struct vk_shader *vk_shader, + uint32_t *executable_count, VkPipelineExecutablePropertiesKHR *properties) +{ + struct hk_api_shader *obj = + container_of(vk_shader, struct hk_api_shader, vk); + + VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out, properties, + executable_count); + + vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) + { + props->stages = mesa_to_vk_shader_stage(obj->vk.stage); + props->subgroupSize = 32; + WRITE_STR(props->name, "%s", _mesa_shader_stage_to_string(obj->vk.stage)); + WRITE_STR(props->description, "%s shader", + _mesa_shader_stage_to_string(obj->vk.stage)); + } + + return vk_outarray_status(&out); +} + +static VkResult +hk_shader_get_executable_statistics( + UNUSED struct vk_device *device, const struct vk_shader *vk_shader, + uint32_t executable_index, uint32_t *statistic_count, + VkPipelineExecutableStatisticKHR *statistics) +{ + struct hk_api_shader *obj = + container_of(vk_shader, struct hk_api_shader, vk); + + VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out, statistics, + statistic_count); + + assert(executable_index == 0); + + /* TODO: find a sane way to report multiple variants and have that play nice + * with zink. + */ + struct hk_shader *shader = hk_any_variant(obj); + + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) + { + WRITE_STR(stat->name, "Code Size"); + WRITE_STR(stat->description, + "Size of the compiled shader binary, in bytes"); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = shader->code_size; + } + + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) + { + WRITE_STR(stat->name, "Number of GPRs"); + WRITE_STR(stat->description, "Number of GPRs used by this pipeline"); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = shader->b.info.nr_gprs; + } + + return vk_outarray_status(&out); +} + +static bool +write_ir_text(VkPipelineExecutableInternalRepresentationKHR *ir, + const char *data) +{ + ir->isText = VK_TRUE; + + size_t data_len = strlen(data) + 1; + + if (ir->pData == NULL) { + ir->dataSize = data_len; + return true; + } + + strncpy(ir->pData, data, ir->dataSize); + if (ir->dataSize < data_len) + return false; + + ir->dataSize = data_len; + return true; +} + +static VkResult +hk_shader_get_executable_internal_representations( + UNUSED struct vk_device *device, const struct vk_shader *vk_shader, + uint32_t executable_index, uint32_t *internal_representation_count, + VkPipelineExecutableInternalRepresentationKHR *internal_representations) +{ + VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out, + internal_representations, + internal_representation_count); + bool incomplete_text = false; + + assert(executable_index == 0); + + /* TODO */ +#if 0 + vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) { + WRITE_STR(ir->name, "AGX assembly"); + WRITE_STR(ir->description, "AGX assembly"); + if (!write_ir_text(ir, TODO)) + incomplete_text = true; + } +#endif + + return incomplete_text ? 
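/*
 * Annotation: write_ir_text above follows the usual Vulkan two-call
 * convention: with pData == NULL it only reports the required size, otherwise
 * it copies at most dataSize bytes and the caller reports VK_INCOMPLETE if
 * the text was truncated.  Standalone sketch of that convention (hypothetical
 * helper, not the driver's function):
 */
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

static bool
example_query_text(const char *src, char *dst, size_t *size)
{
   size_t needed = strlen(src) + 1;

   if (dst == NULL) {
      *size = needed;              /* first call: size query only */
      return true;
   }

   if (*size == 0)
      return false;

   size_t n = needed < *size ? needed : *size;
   memcpy(dst, src, n);
   dst[n - 1] = '\0';              /* keep the copy NUL-terminated */

   bool complete = *size >= needed;
   *size = n;
   return complete;                /* false => report VK_INCOMPLETE */
}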
VK_INCOMPLETE : vk_outarray_status(&out); +} + +static const struct vk_shader_ops hk_shader_ops = { + .destroy = hk_api_shader_destroy, + .serialize = hk_api_shader_serialize, + .get_executable_properties = hk_shader_get_executable_properties, + .get_executable_statistics = hk_shader_get_executable_statistics, + .get_executable_internal_representations = + hk_shader_get_executable_internal_representations, +}; + +const struct vk_device_shader_ops hk_device_shader_ops = { + .get_nir_options = hk_get_nir_options, + .get_spirv_options = hk_get_spirv_options, + .preprocess_nir = hk_preprocess_nir, + .hash_graphics_state = hk_hash_graphics_state, + .compile = hk_compile_shaders, + .deserialize = hk_deserialize_api_shader, + .cmd_set_dynamic_graphics_state = vk_cmd_set_dynamic_graphics_state, + .cmd_bind_shaders = hk_cmd_bind_shaders, +}; + +struct hk_linked_shader * +hk_fast_link(struct hk_device *dev, bool fragment, struct hk_shader *main, + struct agx_shader_part *prolog, struct agx_shader_part *epilog, + unsigned nr_samples_shaded) +{ + struct hk_linked_shader *s = rzalloc(NULL, struct hk_linked_shader); + agx_fast_link(&s->b, &dev->dev, fragment, &main->b, prolog, epilog, + nr_samples_shaded); + + if (fragment) { + agx_pack(&s->fs_counts, FRAGMENT_SHADER_WORD_0, cfg) { + cfg.cf_binding_count = s->b.cf.nr_bindings; + cfg.uniform_register_count = main->b.info.push_count; + cfg.preshader_register_count = main->b.info.nr_preamble_gprs; + cfg.sampler_state_register_count = + agx_translate_sampler_state_count(s->b.uses_txf ? 1 : 0, false); + } + } + + /* Now that we've linked, bake the USC words to bind this program */ + struct agx_usc_builder b = agx_usc_builder(s->usc.data, sizeof(s->usc.data)); + + if (main && main->b.info.immediate_size_16) { + unreachable("todo"); +#if 0 + /* XXX: do ahead of time */ + uint64_t ptr = agx_pool_upload_aligned( + &cmd->pool, s->b.info.immediates, s->b.info.immediate_size_16 * 2, 64); + + for (unsigned range = 0; range < constant_push_ranges; ++range) { + unsigned offset = 64 * range; + assert(offset < s->b.info.immediate_size_16); + + agx_usc_uniform(&b, s->b.info.immediate_base_uniform + offset, + MIN2(64, s->b.info.immediate_size_16 - offset), + ptr + (offset * 2)); + } +#endif + } + + agx_usc_push_packed(&b, UNIFORM, dev->rodata.image_heap); + + if (s->b.uses_txf) + agx_usc_push_packed(&b, SAMPLER, dev->rodata.txf_sampler); + + if (main && (main->b.info.stage == MESA_SHADER_COMPUTE || + main->b.info.stage == MESA_SHADER_TESS_CTRL)) { + unsigned size = main->b.info.local_size; + + agx_usc_pack(&b, SHARED, cfg) { + cfg.layout = AGX_SHARED_LAYOUT_VERTEX_COMPUTE; + cfg.bytes_per_threadgroup = size > 0 ? size : 65536; + cfg.uses_shared_memory = size > 0; + } + } else if (!fragment) { + agx_usc_shared_none(&b); + } + + agx_usc_push_packed(&b, SHADER, s->b.shader); + agx_usc_push_packed(&b, REGISTERS, s->b.regs); + + if (fragment) + agx_usc_push_packed(&b, FRAGMENT_PROPERTIES, s->b.fragment_props); + + if (main && main->b.info.has_preamble) { + agx_usc_pack(&b, PRESHADER, cfg) { + cfg.code = main->preamble_addr; + } + } else { + agx_usc_pack(&b, NO_PRESHADER, cfg) + ; + } + + s->usc.size = b.head - s->usc.data; + return s; +} diff --git a/src/asahi/vulkan/hk_shader.h b/src/asahi/vulkan/hk_shader.h new file mode 100644 index 00000000000..458266f8365 --- /dev/null +++ b/src/asahi/vulkan/hk_shader.h @@ -0,0 +1,400 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
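/*
 * Annotation: hk_fast_link above bakes the USC control words into a small
 * fixed-size buffer inside hk_linked_shader and records how many bytes were
 * written, so binding the program later is just a copy of pre-packed words.
 * Generic sketch of that append-into-fixed-buffer pattern (hypothetical
 * types; the driver uses the agx_usc_* helpers):
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

struct example_word_builder {
   uint8_t *data;
   size_t head, capacity;
};

static void
example_push_words(struct example_word_builder *b, const void *words,
                   size_t size)
{
   /* capacity is a compile-time worst case, like HK_MAX_LINKED_USC_SIZE */
   assert(b->head + size <= b->capacity);
   memcpy(b->data + b->head, words, size);
   b->head += size;
}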
+ * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "asahi/compiler/agx_compile.h" +#include "util/macros.h" +#include "agx_linker.h" +#include "agx_nir_lower_vbo.h" +#include "agx_pack.h" +#include "agx_usc.h" +#include "agx_uvs.h" + +#include "hk_device.h" +#include "hk_device_memory.h" +#include "hk_private.h" + +#include "nir_xfb_info.h" +#include "shader_enums.h" +#include "vk_pipeline_cache.h" + +#include "nir.h" + +#include "vk_shader.h" + +struct hk_physical_device; +struct hk_pipeline_compilation_ctx; +struct vk_descriptor_set_layout; +struct vk_graphics_pipeline_state; +struct vk_pipeline_cache; +struct vk_pipeline_layout; +struct vk_pipeline_robustness_state; +struct vk_shader_module; + +/* TODO: Make dynamic */ +#define HK_ROOT_UNIFORM 104 +#define HK_IMAGE_HEAP_UNIFORM 108 + +struct hk_shader_info { + union { + struct { + uint32_t attribs_read; + BITSET_DECLARE(attrib_components_read, AGX_MAX_ATTRIBS * 4); + uint8_t cull_distance_array_size; + uint8_t _pad[7]; + } vs; + + struct { + /* Local workgroup size */ + uint16_t local_size[3]; + + uint8_t _pad[26]; + } cs; + + struct { + struct agx_interp_info interp; + struct agx_fs_epilog_link_info epilog_key; + + bool reads_sample_mask; + bool post_depth_coverage; + bool uses_sample_shading; + bool early_fragment_tests; + bool writes_memory; + + uint8_t _pad[7]; + } fs; + + struct { + uint8_t spacing; + uint8_t mode; + enum mesa_prim out_prim; + bool point_mode; + bool ccw; + uint8_t _pad[27]; + } ts; + + struct { + uint64_t per_vertex_outputs; + uint32_t output_stride; + uint8_t output_patch_size; + uint8_t nr_patch_outputs; + uint8_t _pad[18]; + } tcs; + + struct { + unsigned count_words; + enum mesa_prim out_prim; + uint8_t _pad[27]; + } gs; + + /* Used to initialize the union for other stages */ + uint8_t _pad[32]; + }; + + struct agx_unlinked_uvs_layout uvs; + + /* Transform feedback buffer strides */ + uint8_t xfb_stride[MAX_XFB_BUFFERS]; + + gl_shader_stage stage : 8; + uint8_t clip_distance_array_size; + uint8_t cull_distance_array_size; + uint8_t _pad0[1]; + + /* XXX: is there a less goofy way to do this? I really don't want dynamic + * allocation here. + */ + nir_xfb_info xfb_info; + nir_xfb_output_info xfb_outputs[64]; +}; + +/* + * Hash table keys for fast-linked shader variants. These contain the entire + * prolog/epilog key so we only do 1 hash table lookup instead of 2 in the + * general case where the linked shader is already ready. + */ +struct hk_fast_link_key_vs { + struct agx_vs_prolog_key prolog; +}; + +struct hk_fast_link_key_fs { + unsigned nr_samples_shaded; + struct agx_fs_prolog_key prolog; + struct agx_fs_epilog_key epilog; +}; + +struct hk_shader { + struct agx_shader_part b; + + struct hk_shader_info info; + struct agx_fragment_face_2_packed frag_face; + struct agx_counts_packed counts; + + const void *code_ptr; + uint32_t code_size; + + const void *data_ptr; + uint32_t data_size; + + /* BO for any uploaded shader part */ + struct agx_bo *bo; + + /* Cache of fast linked variants */ + struct { + simple_mtx_t lock; + struct hash_table *ht; + } linked; + + /* If there's only a single possibly linked variant, direct pointer. TODO: + * Union with the cache to save some space? + */ + struct hk_linked_shader *only_linked; + + /* Address to the uploaded preamble section. Preambles are uploaded + * separately from fast-linked main shaders. 
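/*
 * Annotation: the explicit _pad[] members above keep every stage's view of
 * the union the same size and, presumably, keep the serialized bytes
 * deterministic, since hk_shader_info is written to and read from the shader
 * binary as a raw byte copy.  A stripped-down illustration with made-up
 * fields:
 */
#include <assert.h>
#include <stdint.h>

struct example_stage_info {
   union {
      struct { uint32_t attribs_read;  uint8_t _pad[28]; } vs;
      struct { uint16_t local_size[3]; uint8_t _pad[26]; } cs;
      uint8_t _pad[32];   /* used to zero-initialize the union for other stages */
   };
};

static_assert(sizeof(struct example_stage_info) == 32,
              "every stage view of the union occupies the same 32 bytes");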
+ */ + uint64_t preamble_addr; + + /* Address of the start of the shader data section */ + uint64_t data_addr; +}; + +enum hk_vs_variant { + /* Hardware vertex shader, when next stage is fragment */ + HK_VS_VARIANT_HW, + + /* Hardware compute shader, when next is geometry/tessellation */ + HK_VS_VARIANT_SW, + + HK_VS_VARIANTS, +}; + +enum hk_gs_variant { + /* Hardware vertex shader used for rasterization */ + HK_GS_VARIANT_RAST, + + /* Main compute shader */ + HK_GS_VARIANT_MAIN, + HK_GS_VARIANT_MAIN_NO_RAST, + + /* Count compute shader */ + HK_GS_VARIANT_COUNT, + HK_GS_VARIANT_COUNT_NO_RAST, + + /* Pre-GS compute shader */ + HK_GS_VARIANT_PRE, + HK_GS_VARIANT_PRE_NO_RAST, + + HK_GS_VARIANTS, +}; + +/* clang-format off */ +static const char *hk_gs_variant_name[] = { + [HK_GS_VARIANT_RAST] = "Rasterization", + [HK_GS_VARIANT_MAIN] = "Main", + [HK_GS_VARIANT_MAIN_NO_RAST] = "Main (rast. discard)", + [HK_GS_VARIANT_COUNT] = "Count", + [HK_GS_VARIANT_COUNT_NO_RAST] = "Count (rast. discard)", + [HK_GS_VARIANT_PRE] = "Pre-GS", + [HK_GS_VARIANT_PRE_NO_RAST] = "Pre-GS (rast. discard)", +}; +/* clang-format on */ + +static inline unsigned +hk_num_variants(gl_shader_stage stage) +{ + switch (stage) { + case MESA_SHADER_VERTEX: + case MESA_SHADER_TESS_EVAL: + return HK_VS_VARIANTS; + + case MESA_SHADER_GEOMETRY: + return HK_GS_VARIANTS; + + default: + return 1; + } +} + +/* + * An hk_api shader maps 1:1 to a VkShader object. An hk_api_shader may contain + * multiple hardware hk_shader's, built at shader compile time. This complexity + * is required to efficiently implement the legacy geometry pipeline. + */ +struct hk_api_shader { + struct vk_shader vk; + + /* Is this an internal passthrough geometry shader? */ + bool is_passthrough; + + struct hk_shader variants[]; +}; + +#define hk_foreach_variant(api_shader, var) \ + for (struct hk_shader *var = api_shader->variants; \ + var < api_shader->variants + hk_num_variants(api_shader->vk.stage); \ + ++var) + +static const char * +hk_variant_name(struct hk_api_shader *obj, struct hk_shader *variant) +{ + unsigned i = variant - obj->variants; + assert(i < hk_num_variants(obj->vk.stage)); + + if (hk_num_variants(obj->vk.stage) == 1) { + return NULL; + } else if (obj->vk.stage == MESA_SHADER_GEOMETRY) { + assert(i < ARRAY_SIZE(hk_gs_variant_name)); + return hk_gs_variant_name[i]; + } else { + assert(i < 2); + return i == HK_VS_VARIANT_SW ? 
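/*
 * Annotation: usage sketch for the trailing variants[] array above, assuming
 * the declarations from this header are in scope.  example_zalloc stands in
 * for vk_shader_zalloc (which also fills in vk.stage) and is not a real API;
 * the point is only that the allocation is sized by hk_num_variants() and
 * walked with hk_foreach_variant().
 */
static struct hk_api_shader *
example_alloc_api_shader(gl_shader_stage stage, void *(*example_zalloc)(size_t))
{
   size_t size = sizeof(struct hk_api_shader) +
                 sizeof(struct hk_shader) * hk_num_variants(stage);

   struct hk_api_shader *obj = example_zalloc(size);   /* zero-initialized */
   if (obj == NULL)
      return NULL;

   obj->vk.stage = stage;

   hk_foreach_variant(obj, variant) {
      /* each trailing hk_shader is compiled and uploaded independently */
      (void)variant;
   }

   return obj;
}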
"Software" : "Hardware"; + } +} + +static struct hk_shader * +hk_only_variant(struct hk_api_shader *obj) +{ + if (!obj) + return NULL; + + assert(hk_num_variants(obj->vk.stage) == 1); + return &obj->variants[0]; +} + +static struct hk_shader * +hk_any_variant(struct hk_api_shader *obj) +{ + if (!obj) + return NULL; + + return &obj->variants[0]; +} + +static struct hk_shader * +hk_main_gs_variant(struct hk_api_shader *obj, bool rast_disc) +{ + return &obj->variants[HK_GS_VARIANT_MAIN + rast_disc]; +} + +static struct hk_shader * +hk_count_gs_variant(struct hk_api_shader *obj, bool rast_disc) +{ + return &obj->variants[HK_GS_VARIANT_COUNT + rast_disc]; +} + +static struct hk_shader * +hk_pre_gs_variant(struct hk_api_shader *obj, bool rast_disc) +{ + return &obj->variants[HK_GS_VARIANT_PRE + rast_disc]; +} + +#define HK_MAX_LINKED_USC_SIZE \ + (AGX_USC_PRESHADER_LENGTH + AGX_USC_FRAGMENT_PROPERTIES_LENGTH + \ + AGX_USC_REGISTERS_LENGTH + AGX_USC_SHADER_LENGTH + AGX_USC_SHARED_LENGTH + \ + AGX_USC_SAMPLER_LENGTH + (AGX_USC_UNIFORM_LENGTH * 9)) + +struct hk_linked_shader { + struct agx_linked_shader b; + + /* Distinct from hk_shader::counts due to addition of cf_binding_count, which + * is delayed since it depends on cull distance. + */ + struct agx_fragment_shader_word_0_packed fs_counts; + + /* Baked USC words to bind this linked shader */ + struct { + uint8_t data[HK_MAX_LINKED_USC_SIZE]; + size_t size; + } usc; +}; + +struct hk_linked_shader *hk_fast_link(struct hk_device *dev, bool fragment, + struct hk_shader *main, + struct agx_shader_part *prolog, + struct agx_shader_part *epilog, + unsigned nr_samples_shaded); + +extern const struct vk_device_shader_ops hk_device_shader_ops; + +uint64_t +hk_physical_device_compiler_flags(const struct hk_physical_device *pdev); + +static inline nir_address_format +hk_buffer_addr_format(VkPipelineRobustnessBufferBehaviorEXT robustness) +{ + switch (robustness) { + case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT: + return nir_address_format_64bit_global_32bit_offset; + case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT: + case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT: + return nir_address_format_64bit_bounded_global; + default: + unreachable("Invalid robust buffer access behavior"); + } +} + +bool hk_lower_uvs_index(nir_shader *s, unsigned vs_uniform_base); + +bool +hk_nir_lower_descriptors(nir_shader *nir, + const struct vk_pipeline_robustness_state *rs, + uint32_t set_layout_count, + struct vk_descriptor_set_layout *const *set_layouts); +void hk_lower_nir(struct hk_device *dev, nir_shader *nir, + const struct vk_pipeline_robustness_state *rs, + bool is_multiview, uint32_t set_layout_count, + struct vk_descriptor_set_layout *const *set_layouts); + +VkResult hk_compile_shader(struct hk_device *dev, + struct vk_shader_compile_info *info, + const struct vk_graphics_pipeline_state *state, + const VkAllocationCallbacks *pAllocator, + struct hk_api_shader **shader_out); + +void hk_preprocess_nir_internal(struct vk_physical_device *vk_pdev, + nir_shader *nir); + +void hk_api_shader_destroy(struct vk_device *vk_dev, + struct vk_shader *vk_shader, + const VkAllocationCallbacks *pAllocator); + +const nir_shader_compiler_options * +hk_get_nir_options(struct vk_physical_device *vk_pdev, gl_shader_stage stage, + UNUSED const struct vk_pipeline_robustness_state *rs); + +struct hk_api_shader *hk_meta_shader(struct hk_device *dev, + hk_internal_builder_t builder, void *data, + size_t data_size); + +static inline struct 
hk_shader * +hk_meta_kernel(struct hk_device *dev, hk_internal_builder_t builder, void *data, + size_t data_size) +{ + return hk_only_variant(hk_meta_shader(dev, builder, data, data_size)); +} + +struct hk_passthrough_gs_key { + /* Bit mask of outputs written by the VS/TES, to be passed through */ + uint64_t outputs; + + /* Clip/cull sizes, implies clip/cull written in output */ + uint8_t clip_distance_array_size; + uint8_t cull_distance_array_size; + + /* Transform feedback buffer strides */ + uint8_t xfb_stride[MAX_XFB_BUFFERS]; + + /* Decomposed primitive */ + enum mesa_prim prim; + + /* Transform feedback info. Must add nir_xfb_info_size to get the key size */ + nir_xfb_info xfb_info; +}; + +void hk_nir_passthrough_gs(struct nir_builder *b, const void *key_); diff --git a/src/asahi/vulkan/hk_wsi.c b/src/asahi/vulkan/hk_wsi.c new file mode 100644 index 00000000000..b95d09a7d97 --- /dev/null +++ b/src/asahi/vulkan/hk_wsi.c @@ -0,0 +1,44 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_wsi.h" +#include "hk_instance.h" +#include "wsi_common.h" + +static VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL +hk_wsi_proc_addr(VkPhysicalDevice physicalDevice, const char *pName) +{ + VK_FROM_HANDLE(hk_physical_device, pdev, physicalDevice); + return vk_instance_get_proc_addr_unchecked(pdev->vk.instance, pName); +} + +VkResult +hk_init_wsi(struct hk_physical_device *pdev) +{ + VkResult result; + + struct wsi_device_options wsi_options = {.sw_device = false}; + result = wsi_device_init( + &pdev->wsi_device, hk_physical_device_to_handle(pdev), hk_wsi_proc_addr, + &pdev->vk.instance->alloc, pdev->master_fd, + &hk_physical_device_instance(pdev)->dri_options, &wsi_options); + if (result != VK_SUCCESS) + return result; + + pdev->wsi_device.supports_scanout = false; + pdev->wsi_device.supports_modifiers = true; + + pdev->vk.wsi_device = &pdev->wsi_device; + + return result; +} + +void +hk_finish_wsi(struct hk_physical_device *pdev) +{ + pdev->vk.wsi_device = NULL; + wsi_device_finish(&pdev->wsi_device, &pdev->vk.instance->alloc); +} diff --git a/src/asahi/vulkan/hk_wsi.h b/src/asahi/vulkan/hk_wsi.h new file mode 100644 index 00000000000..458f0cd1616 --- /dev/null +++ b/src/asahi/vulkan/hk_wsi.h @@ -0,0 +1,13 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "hk_physical_device.h" + +VkResult hk_init_wsi(struct hk_physical_device *pdev); +void hk_finish_wsi(struct hk_physical_device *pdev); diff --git a/src/asahi/vulkan/meson.build b/src/asahi/vulkan/meson.build new file mode 100644 index 00000000000..7b66cf2c1f0 --- /dev/null +++ b/src/asahi/vulkan/meson.build @@ -0,0 +1,142 @@ +# Copyright © 2022 Collabora Ltd. and Red Hat Inc. 
+# SPDX-License-Identifier: MIT +hk_files = files( + 'hk_buffer.c', + 'hk_buffer.h', + 'hk_buffer_view.c', + 'hk_buffer_view.h', + 'hk_cmd_buffer.c', + 'hk_cmd_buffer.h', + 'hk_cmd_clear.c', + 'hk_cmd_dispatch.c', + 'hk_cmd_draw.c', + 'hk_cmd_meta.c', + 'hk_cmd_pool.c', + 'hk_cmd_pool.h', + 'hk_descriptor_set.h', + 'hk_descriptor_set.c', + 'hk_descriptor_set_layout.c', + 'hk_descriptor_set_layout.h', + 'hk_descriptor_table.c', + 'hk_descriptor_table.h', + 'hk_device.c', + 'hk_device.h', + 'hk_device_memory.c', + 'hk_device_memory.h', + 'hk_event.c', + 'hk_event.h', + 'hk_format.c', + 'hk_image.c', + 'hk_image.h', + 'hk_image_view.c', + 'hk_image_view.h', + 'hk_instance.c', + 'hk_instance.h', + 'hk_nir_lower_descriptors.c', + 'hk_nir_passthrough_gs.c', + 'hk_physical_device.c', + 'hk_physical_device.h', + 'hk_private.h', + 'hk_query_pool.c', + 'hk_query_pool.h', + 'hk_queue.c', + 'hk_queue.h', + 'hk_sampler.c', + 'hk_sampler.h', + 'hk_shader.c', + 'hk_shader.h', + 'hk_wsi.c', + 'hk_wsi.h' +) + +hk_entrypoints = custom_target( + 'hk_entrypoints', + input : [vk_entrypoints_gen, vk_api_xml], + output : ['hk_entrypoints.h', 'hk_entrypoints.c'], + command : [ + prog_python, '@INPUT0@', '--xml', '@INPUT1@', '--proto', '--weak', + '--out-h', '@OUTPUT0@', '--out-c', '@OUTPUT1@', '--prefix', 'hk', + '--beta', with_vulkan_beta.to_string(), + ], + depend_files : vk_entrypoints_gen_depend_files, +) + +hk_deps = [ + dep_libdrm, + idep_nir, + idep_vulkan_runtime, + idep_vulkan_util, + idep_vulkan_wsi, + idep_vulkan_wsi_headers, + idep_agx_pack, +] + +libhk = static_library( + 'hk', + [ + hk_entrypoints, + hk_files, + libagx_shaders, + sha1_h, + ], + include_directories : [ + inc_gallium, + inc_gallium_aux, + inc_include, + inc_src, + inc_asahi, + ], + link_with : [libasahi_lib, libasahi_layout, libasahi_compiler], + c_args : ['-Wno-c2x-extensions'], + dependencies : [hk_deps], + gnu_symbol_visibility : 'hidden', +) + +libvulkan_asahi = shared_library( + 'vulkan_asahi', + link_whole : [libhk], + link_args: [ld_args_build_id], + gnu_symbol_visibility : 'hidden', + install : true, +) + +icd_lib_path = join_paths(get_option('prefix'), get_option('libdir')) +icd_file_name = 'libvulkan_asahi.so' +if with_platform_windows + icd_lib_path = import('fs').relative_to(get_option('bindir'), with_vulkan_icd_dir) + icd_file_name = 'vulkan_asahi.dll' +endif + +asahi_icd = custom_target( + 'asahi_icd', + input : [vk_icd_gen, vk_api_xml], + output : 'asahi_icd.@0@.json'.format(host_machine.cpu()), + command : [ + prog_python, '@INPUT0@', + '--api-version', '1.3', '--xml', '@INPUT1@', + '--lib-path', join_paths(icd_lib_path, icd_file_name), + '--out', '@OUTPUT@', + ], + build_by_default : true, + install_dir : with_vulkan_icd_dir, + install_tag : 'runtime', + install : true, +) + +_dev_icdname = 'asahi_devenv_icd.@0@.json'.format(host_machine.cpu()) +custom_target( + 'asahi_devenv_icd', + input : [vk_icd_gen, vk_api_xml], + output : _dev_icdname, + command : [ + prog_python, '@INPUT0@', + '--api-version', '1.3', '--xml', '@INPUT1@', + '--lib-path', meson.current_build_dir() / icd_file_name, + '--out', '@OUTPUT@', + ], + build_by_default : true, +) + +devenv.append('VK_DRIVER_FILES', meson.current_build_dir() / _dev_icdname) +# Deprecated: replaced by VK_DRIVER_FILES above +devenv.append('VK_ICD_FILENAMES', meson.current_build_dir() / _dev_icdname)
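/*
 * Annotation (appendix, not part of the patch): a sketch of how the
 * per-hk_shader fast-link cache declared in hk_shader.h above is presumably
 * consulted at draw time: look up the prolog/epilog key under linked.lock and
 * fall back to hk_fast_link() on a miss.  Assumes hk_shader.h and
 * util/hash_table.h are included; the hash/compare callbacks, key ownership
 * and error handling are elided, so this is illustrative only.
 */
static struct hk_linked_shader *
example_get_linked_fs(struct hk_device *dev, struct hk_shader *shader,
                      const struct hk_fast_link_key_fs *key,
                      struct agx_shader_part *prolog,
                      struct agx_shader_part *epilog)
{
   struct hk_linked_shader *linked;

   simple_mtx_lock(&shader->linked.lock);

   struct hash_entry *ent = _mesa_hash_table_search(shader->linked.ht, key);
   if (ent != NULL) {
      linked = ent->data;
   } else {
      linked = hk_fast_link(dev, true /* fragment */, shader, prolog, epilog,
                            key->nr_samples_shaded);
      /* A real implementation would insert a copy of the key that it owns. */
      _mesa_hash_table_insert(shader->linked.ht, key, linked);
   }

   simple_mtx_unlock(&shader->linked.lock);
   return linked;
}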