mesa/src/intel/vulkan/anv_device.c
Lionel Landwerlin 3584fc6482 anv: use weak_ref mode for global pipeline caches
So that as soon as pipelines are freed, they're removed from the
cache.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/11185
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com>
Tested-by: Brian Paul <brian.paul@broadcom.com>
Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.com>
Cc: mesa-stable
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29283>
2024-05-22 15:22:56 +00:00
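
The change switches ANV's global pipeline caches to the weak_ref mode of
the common vk_pipeline_cache code: the cache itself holds no reference on
its entries, so when the last pipeline owning an entry is destroyed, the
entry is evicted rather than lingering in the cache. A minimal sketch of
weak-reference cache semantics (hypothetical example_* names, not the
actual vk_pipeline_cache implementation):

#include <stdint.h>
#include <stdlib.h>

struct example_cache;
void example_cache_remove(struct example_cache *cache, void *entry);

/* The cache stores a pointer but owns no reference; the last external
 * unref evicts the entry instead of keeping it alive. */
struct example_entry {
   uint32_t refcount;           /* held by pipeline objects only */
   struct example_cache *cache; /* back-pointer used for eviction */
};

void example_entry_unref(struct example_entry *entry)
{
   if (--entry->refcount == 0) {
      example_cache_remove(entry->cache, entry); /* weak_ref eviction */
      free(entry);
   }
}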

/*
* Copyright © 2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include <assert.h>
#include <inttypes.h>
#include <stdbool.h>
#include <string.h>
#ifdef MAJOR_IN_MKDEV
#include <sys/mkdev.h>
#endif
#ifdef MAJOR_IN_SYSMACROS
#include <sys/sysmacros.h>
#endif
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include "drm-uapi/drm_fourcc.h"
#include "drm-uapi/drm.h"
#include <xf86drm.h>
#include "anv_private.h"
#include "anv_measure.h"
#include "util/u_debug.h"
#include "util/build_id.h"
#include "util/disk_cache.h"
#include "util/mesa-sha1.h"
#include "util/os_file.h"
#include "util/os_misc.h"
#include "util/u_atomic.h"
#if DETECT_OS_ANDROID
#include "util/u_gralloc/u_gralloc.h"
#endif
#include "util/u_string.h"
#include "util/driconf.h"
#include "git_sha1.h"
#include "vk_common_entrypoints.h"
#include "vk_util.h"
#include "vk_deferred_operation.h"
#include "vk_drm_syncobj.h"
#include "common/intel_aux_map.h"
#include "common/intel_debug_identifier.h"
#include "common/intel_uuid.h"
#include "perf/intel_perf.h"
#include "i915/anv_device.h"
#include "xe/anv_device.h"
#include "xe/anv_queue.h"
#include "genxml/gen7_pack.h"
#include "genxml/genX_bits.h"
static const driOptionDescription anv_dri_options[] = {
DRI_CONF_SECTION_PERFORMANCE
DRI_CONF_ADAPTIVE_SYNC(true)
DRI_CONF_VK_X11_OVERRIDE_MIN_IMAGE_COUNT(0)
DRI_CONF_VK_X11_STRICT_IMAGE_COUNT(false)
DRI_CONF_VK_KHR_PRESENT_WAIT(false)
DRI_CONF_VK_XWAYLAND_WAIT_READY(false)
DRI_CONF_ANV_ASSUME_FULL_SUBGROUPS(0)
DRI_CONF_ANV_DISABLE_FCV(false)
DRI_CONF_ANV_EXTERNAL_MEMORY_IMPLICIT_SYNC(true)
DRI_CONF_ANV_SAMPLE_MASK_OUT_OPENGL_BEHAVIOUR(false)
DRI_CONF_ANV_FORCE_FILTER_ADDR_ROUNDING(false)
DRI_CONF_ANV_FP64_WORKAROUND_ENABLED(false)
DRI_CONF_ANV_GENERATED_INDIRECT_THRESHOLD(4)
DRI_CONF_ANV_GENERATED_INDIRECT_RING_THRESHOLD(100)
DRI_CONF_NO_16BIT(false)
DRI_CONF_INTEL_ENABLE_WA_14018912822(false)
DRI_CONF_ANV_QUERY_CLEAR_WITH_BLORP_THRESHOLD(6)
DRI_CONF_ANV_QUERY_COPY_WITH_SHADER_THRESHOLD(6)
DRI_CONF_ANV_FORCE_INDIRECT_DESCRIPTORS(false)
DRI_CONF_SHADER_SPILLING_RATE(0)
DRI_CONF_OPT_B(intel_tbimr, true, "Enable TBIMR tiled rendering")
DRI_CONF_ANV_COMPRESSION_CONTROL_ENABLED(false)
DRI_CONF_SECTION_END
DRI_CONF_SECTION_DEBUG
DRI_CONF_ALWAYS_FLUSH_CACHE(false)
DRI_CONF_VK_WSI_FORCE_BGRA8_UNORM_FIRST(false)
DRI_CONF_VK_WSI_FORCE_SWAPCHAIN_TO_CURRENT_EXTENT(false)
DRI_CONF_VK_X11_IGNORE_SUBOPTIMAL(false)
DRI_CONF_LIMIT_TRIG_INPUT_RANGE(false)
DRI_CONF_ANV_MESH_CONV_PRIM_ATTRS_TO_VERT_ATTRS(-2)
DRI_CONF_FORCE_VK_VENDOR(0)
DRI_CONF_FAKE_SPARSE(false)
#if DETECT_OS_ANDROID && ANDROID_API_LEVEL >= 34
DRI_CONF_VK_REQUIRE_ASTC(true)
#else
DRI_CONF_VK_REQUIRE_ASTC(false)
#endif
DRI_CONF_SECTION_END
DRI_CONF_SECTION_QUALITY
DRI_CONF_PP_LOWER_DEPTH_RANGE_RATE()
DRI_CONF_SECTION_END
};
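/* The options declared above are parsed into each instance's dri_options
* cache and queried when deciding features and extensions. A minimal
* sketch of the query pattern used later in this file (assuming an
* already-initialized driOptionCache named dri_options):
*/
#if 0
   bool present_wait_opt =
      driQueryOptionb(&instance->dri_options, "vk_khr_present_wait");
   if (present_wait_opt) {
      /* Opt in to exposing VK_KHR_present_id / VK_KHR_present_wait. */
   }
#endif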
/* This is probably far too big but it reflects the max size used for
* messages in OpenGL's KHR_debug.
*/
#define MAX_DEBUG_MESSAGE_LENGTH 4096
/* The "RAW" clocks on Linux are called "FAST" on FreeBSD */
#if !defined(CLOCK_MONOTONIC_RAW) && defined(CLOCK_MONOTONIC_FAST)
#define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC_FAST
#endif
static void
compiler_debug_log(void *data, UNUSED unsigned *id, const char *fmt, ...)
{
char str[MAX_DEBUG_MESSAGE_LENGTH];
struct anv_device *device = (struct anv_device *)data;
UNUSED struct anv_instance *instance = device->physical->instance;
va_list args;
va_start(args, fmt);
(void) vsnprintf(str, MAX_DEBUG_MESSAGE_LENGTH, fmt, args);
va_end(args);
//vk_logd(VK_LOG_NO_OBJS(&instance->vk), "%s", str);
}
static void
compiler_perf_log(UNUSED void *data, UNUSED unsigned *id, const char *fmt, ...)
{
va_list args;
va_start(args, fmt);
if (INTEL_DEBUG(DEBUG_PERF))
mesa_logd_v(fmt, args);
va_end(args);
}
#if defined(VK_USE_PLATFORM_WAYLAND_KHR) || \
defined(VK_USE_PLATFORM_XCB_KHR) || \
defined(VK_USE_PLATFORM_XLIB_KHR) || \
defined(VK_USE_PLATFORM_DISPLAY_KHR)
#define ANV_USE_WSI_PLATFORM
#endif
#ifdef ANDROID_STRICT
#if ANDROID_API_LEVEL >= 33
#define ANV_API_VERSION VK_MAKE_VERSION(1, 3, VK_HEADER_VERSION)
#else
#define ANV_API_VERSION VK_MAKE_VERSION(1, 1, VK_HEADER_VERSION)
#endif
#else
#define ANV_API_VERSION VK_MAKE_VERSION(1, 3, VK_HEADER_VERSION)
#endif
VkResult anv_EnumerateInstanceVersion(
uint32_t* pApiVersion)
{
*pApiVersion = ANV_API_VERSION;
return VK_SUCCESS;
}
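/* From the application's perspective, this entrypoint backs the standard
* vkEnumerateInstanceVersion() call. A short usage sketch (core Vulkan
* API, shown for illustration only):
*/
#if 0
   uint32_t version = 0;
   vkEnumerateInstanceVersion(&version);
   printf("Instance supports Vulkan %u.%u.%u\n",
          VK_API_VERSION_MAJOR(version),
          VK_API_VERSION_MINOR(version),
          VK_API_VERSION_PATCH(version));
#endif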
static const struct vk_instance_extension_table instance_extensions = {
.KHR_device_group_creation = true,
.KHR_external_fence_capabilities = true,
.KHR_external_memory_capabilities = true,
.KHR_external_semaphore_capabilities = true,
.KHR_get_physical_device_properties2 = true,
.EXT_debug_report = true,
.EXT_debug_utils = true,
#ifdef ANV_USE_WSI_PLATFORM
.KHR_get_surface_capabilities2 = true,
.KHR_surface = true,
.KHR_surface_protected_capabilities = true,
.EXT_surface_maintenance1 = true,
.EXT_swapchain_colorspace = true,
#endif
#ifdef VK_USE_PLATFORM_WAYLAND_KHR
.KHR_wayland_surface = true,
#endif
#ifdef VK_USE_PLATFORM_XCB_KHR
.KHR_xcb_surface = true,
#endif
#ifdef VK_USE_PLATFORM_XLIB_KHR
.KHR_xlib_surface = true,
#endif
#ifdef VK_USE_PLATFORM_XLIB_XRANDR_EXT
.EXT_acquire_xlib_display = true,
#endif
#ifdef VK_USE_PLATFORM_DISPLAY_KHR
.KHR_display = true,
.KHR_get_display_properties2 = true,
.EXT_direct_mode_display = true,
.EXT_display_surface_counter = true,
.EXT_acquire_drm_display = true,
#endif
#ifndef VK_USE_PLATFORM_WIN32_KHR
.EXT_headless_surface = true,
#endif
};
static void
get_device_extensions(const struct anv_physical_device *device,
struct vk_device_extension_table *ext)
{
const bool has_syncobj_wait =
(device->sync_syncobj_type.features & VK_SYNC_FEATURE_CPU_WAIT) != 0;
const bool rt_enabled = ANV_SUPPORT_RT && device->info.has_ray_tracing;
*ext = (struct vk_device_extension_table) {
.KHR_8bit_storage = true,
.KHR_16bit_storage = !device->instance->no_16bit,
.KHR_acceleration_structure = rt_enabled,
.KHR_bind_memory2 = true,
.KHR_buffer_device_address = true,
.KHR_calibrated_timestamps = device->has_reg_timestamp,
.KHR_copy_commands2 = true,
.KHR_cooperative_matrix = anv_has_cooperative_matrix(device),
.KHR_create_renderpass2 = true,
.KHR_dedicated_allocation = true,
.KHR_deferred_host_operations = true,
.KHR_depth_stencil_resolve = true,
.KHR_descriptor_update_template = true,
.KHR_device_group = true,
.KHR_draw_indirect_count = true,
.KHR_driver_properties = true,
.KHR_dynamic_rendering = true,
.KHR_external_fence = has_syncobj_wait,
.KHR_external_fence_fd = has_syncobj_wait,
.KHR_external_memory = true,
.KHR_external_memory_fd = true,
.KHR_external_semaphore = true,
.KHR_external_semaphore_fd = true,
.KHR_format_feature_flags2 = true,
.KHR_fragment_shading_rate = device->info.ver >= 11,
.KHR_get_memory_requirements2 = true,
.KHR_global_priority = device->max_context_priority >=
VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR,
.KHR_image_format_list = true,
.KHR_imageless_framebuffer = true,
#ifdef ANV_USE_WSI_PLATFORM
.KHR_incremental_present = true,
#endif
.KHR_index_type_uint8 = true,
.KHR_line_rasterization = true,
.KHR_load_store_op_none = true,
.KHR_maintenance1 = true,
.KHR_maintenance2 = true,
.KHR_maintenance3 = true,
.KHR_maintenance4 = true,
.KHR_maintenance5 = true,
.KHR_maintenance6 = true,
.KHR_map_memory2 = true,
.KHR_multiview = true,
.KHR_performance_query =
device->perf &&
(device->perf->i915_perf_version >= 3 ||
INTEL_DEBUG(DEBUG_NO_OACONFIG)) &&
device->use_call_secondary,
.KHR_pipeline_executable_properties = true,
.KHR_pipeline_library = true,
/* Hide these behind dri configs for now since we cannot implement them
* reliably on all surfaces yet. There is no surface capability query for
* present wait/id, but the feature is useful enough to hide behind an
* opt-in mechanism for now. If the instance only enables surface extensions
* that unconditionally support present wait, we can also expose the
* extension that way. */
.KHR_present_id =
driQueryOptionb(&device->instance->dri_options, "vk_khr_present_wait") ||
wsi_common_vk_instance_supports_present_wait(&device->instance->vk),
.KHR_present_wait =
driQueryOptionb(&device->instance->dri_options, "vk_khr_present_wait") ||
wsi_common_vk_instance_supports_present_wait(&device->instance->vk),
.KHR_push_descriptor = true,
.KHR_ray_query = rt_enabled,
.KHR_ray_tracing_maintenance1 = rt_enabled,
.KHR_ray_tracing_pipeline = rt_enabled,
.KHR_ray_tracing_position_fetch = rt_enabled,
.KHR_relaxed_block_layout = true,
.KHR_sampler_mirror_clamp_to_edge = true,
.KHR_sampler_ycbcr_conversion = true,
.KHR_separate_depth_stencil_layouts = true,
.KHR_shader_atomic_int64 = true,
.KHR_shader_clock = true,
.KHR_shader_draw_parameters = true,
.KHR_shader_expect_assume = true,
.KHR_shader_float16_int8 = !device->instance->no_16bit,
.KHR_shader_float_controls = true,
.KHR_shader_float_controls2 = true,
.KHR_shader_integer_dot_product = true,
.KHR_shader_maximal_reconvergence = true,
.KHR_shader_non_semantic_info = true,
.KHR_shader_quad_control = true,
.KHR_shader_subgroup_extended_types = true,
.KHR_shader_subgroup_rotate = true,
.KHR_shader_subgroup_uniform_control_flow = true,
.KHR_shader_terminate_invocation = true,
.KHR_spirv_1_4 = true,
.KHR_storage_buffer_storage_class = true,
#ifdef ANV_USE_WSI_PLATFORM
.KHR_swapchain = true,
.KHR_swapchain_mutable_format = true,
#endif
.KHR_synchronization2 = true,
.KHR_timeline_semaphore = true,
.KHR_uniform_buffer_standard_layout = true,
.KHR_variable_pointers = true,
.KHR_vertex_attribute_divisor = true,
.KHR_video_queue = device->video_decode_enabled,
.KHR_video_decode_queue = device->video_decode_enabled,
.KHR_video_decode_h264 = VIDEO_CODEC_H264DEC && device->video_decode_enabled,
.KHR_video_decode_h265 = VIDEO_CODEC_H265DEC && device->video_decode_enabled,
.KHR_vulkan_memory_model = true,
.KHR_workgroup_memory_explicit_layout = true,
.KHR_zero_initialize_workgroup_memory = true,
.EXT_4444_formats = true,
.EXT_attachment_feedback_loop_layout = true,
.EXT_attachment_feedback_loop_dynamic_state = true,
.EXT_border_color_swizzle = true,
.EXT_buffer_device_address = true,
.EXT_calibrated_timestamps = device->has_reg_timestamp,
.EXT_color_write_enable = true,
.EXT_conditional_rendering = true,
.EXT_conservative_rasterization = true,
.EXT_custom_border_color = true,
.EXT_depth_bias_control = true,
.EXT_depth_clamp_zero_one = true,
.EXT_depth_clip_control = true,
.EXT_depth_range_unrestricted = device->info.ver >= 20,
.EXT_depth_clip_enable = true,
.EXT_descriptor_buffer = true,
.EXT_descriptor_indexing = true,
#ifdef VK_USE_PLATFORM_DISPLAY_KHR
.EXT_display_control = true,
#endif
.EXT_dynamic_rendering_unused_attachments = true,
.EXT_extended_dynamic_state = true,
.EXT_extended_dynamic_state2 = true,
.EXT_extended_dynamic_state3 = true,
.EXT_external_memory_dma_buf = true,
.EXT_external_memory_host = true,
.EXT_fragment_shader_interlock = true,
.EXT_global_priority = device->max_context_priority >=
VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR,
.EXT_global_priority_query = device->max_context_priority >=
VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR,
.EXT_graphics_pipeline_library = !debug_get_bool_option("ANV_NO_GPL", false),
.EXT_host_query_reset = true,
.EXT_image_2d_view_of_3d = true,
.EXT_image_compression_control = device->instance->compression_control_enabled,
.EXT_image_robustness = true,
.EXT_image_drm_format_modifier = true,
.EXT_image_sliced_view_of_3d = true,
.EXT_image_view_min_lod = true,
.EXT_index_type_uint8 = true,
.EXT_inline_uniform_block = true,
.EXT_legacy_dithering = true,
.EXT_legacy_vertex_attributes = true,
.EXT_line_rasterization = true,
.EXT_load_store_op_none = true,
.EXT_map_memory_placed = device->info.has_mmap_offset,
/* Enable the extension only if we have support for both local & system
* memory
*/
.EXT_memory_budget = (!device->info.has_local_mem ||
device->vram_mappable.available > 0) &&
device->sys.available,
.EXT_mesh_shader = device->info.has_mesh_shading,
.EXT_mutable_descriptor_type = true,
.EXT_nested_command_buffer = true,
.EXT_non_seamless_cube_map = true,
.EXT_pci_bus_info = true,
.EXT_physical_device_drm = true,
.EXT_pipeline_creation_cache_control = true,
.EXT_pipeline_creation_feedback = true,
.EXT_pipeline_library_group_handles = rt_enabled,
.EXT_pipeline_robustness = true,
.EXT_post_depth_coverage = true,
.EXT_primitives_generated_query = true,
.EXT_primitive_topology_list_restart = true,
.EXT_private_data = true,
.EXT_provoking_vertex = true,
.EXT_queue_family_foreign = true,
.EXT_robustness2 = true,
.EXT_sample_locations = true,
.EXT_sampler_filter_minmax = true,
.EXT_scalar_block_layout = true,
.EXT_separate_stencil_usage = true,
.EXT_shader_atomic_float = true,
.EXT_shader_atomic_float2 = true,
.EXT_shader_demote_to_helper_invocation = true,
.EXT_shader_module_identifier = true,
.EXT_shader_stencil_export = true,
.EXT_shader_subgroup_ballot = true,
.EXT_shader_subgroup_vote = true,
.EXT_shader_viewport_index_layer = true,
.EXT_subgroup_size_control = true,
#ifdef ANV_USE_WSI_PLATFORM
.EXT_swapchain_maintenance1 = true,
#endif
.EXT_texel_buffer_alignment = true,
.EXT_tooling_info = true,
.EXT_transform_feedback = true,
.EXT_vertex_attribute_divisor = true,
.EXT_vertex_input_dynamic_state = true,
.EXT_ycbcr_image_arrays = true,
.AMD_buffer_marker = true,
.AMD_texture_gather_bias_lod = device->info.ver >= 20,
#if DETECT_OS_ANDROID
.ANDROID_external_memory_android_hardware_buffer = true,
.ANDROID_native_buffer = true,
#endif
.GOOGLE_decorate_string = true,
.GOOGLE_hlsl_functionality1 = true,
.GOOGLE_user_type = true,
.INTEL_performance_query = device->perf &&
device->perf->i915_perf_version >= 3,
.INTEL_shader_integer_functions2 = true,
.EXT_multi_draw = true,
.NV_compute_shader_derivatives = true,
.VALVE_mutable_descriptor_type = true,
};
}
static void
get_features(const struct anv_physical_device *pdevice,
struct vk_features *features)
{
struct vk_app_info *app_info = &pdevice->instance->vk.app_info;
const bool rt_enabled = ANV_SUPPORT_RT && pdevice->info.has_ray_tracing;
const bool mesh_shader =
pdevice->vk.supported_extensions.EXT_mesh_shader;
const bool has_sparse_or_fake = pdevice->sparse_type != ANV_SPARSE_TYPE_NOT_SUPPORTED;
*features = (struct vk_features) {
/* Vulkan 1.0 */
.robustBufferAccess = true,
.fullDrawIndexUint32 = true,
.imageCubeArray = true,
.independentBlend = true,
.geometryShader = true,
.tessellationShader = true,
.sampleRateShading = true,
.dualSrcBlend = true,
.logicOp = true,
.multiDrawIndirect = true,
.drawIndirectFirstInstance = true,
.depthClamp = true,
.depthBiasClamp = true,
.fillModeNonSolid = true,
.depthBounds = pdevice->info.ver >= 12,
.wideLines = true,
.largePoints = true,
.alphaToOne = true,
.multiViewport = true,
.samplerAnisotropy = true,
.textureCompressionETC2 = true,
.textureCompressionASTC_LDR = pdevice->has_astc_ldr ||
pdevice->emu_astc_ldr,
.textureCompressionBC = true,
.occlusionQueryPrecise = true,
.pipelineStatisticsQuery = true,
.vertexPipelineStoresAndAtomics = true,
.fragmentStoresAndAtomics = true,
.shaderTessellationAndGeometryPointSize = true,
.shaderImageGatherExtended = true,
.shaderStorageImageExtendedFormats = true,
.shaderStorageImageMultisample = false,
/* Gfx12.5 has all the required formats supported in HW for typed
* read/writes
*/
.shaderStorageImageReadWithoutFormat = pdevice->info.verx10 >= 125,
.shaderStorageImageWriteWithoutFormat = true,
.shaderUniformBufferArrayDynamicIndexing = true,
.shaderSampledImageArrayDynamicIndexing = true,
.shaderStorageBufferArrayDynamicIndexing = true,
.shaderStorageImageArrayDynamicIndexing = true,
.shaderClipDistance = true,
.shaderCullDistance = true,
.shaderFloat64 = pdevice->info.has_64bit_float ||
pdevice->instance->fp64_workaround_enabled,
.shaderInt64 = true,
.shaderInt16 = true,
.shaderResourceMinLod = true,
.shaderResourceResidency = has_sparse_or_fake,
.sparseBinding = has_sparse_or_fake,
.sparseResidencyAliased = has_sparse_or_fake,
.sparseResidencyBuffer = has_sparse_or_fake,
.sparseResidencyImage2D = has_sparse_or_fake,
.sparseResidencyImage3D = has_sparse_or_fake,
.sparseResidency2Samples = has_sparse_or_fake,
.sparseResidency4Samples = has_sparse_or_fake,
.sparseResidency8Samples = has_sparse_or_fake &&
pdevice->info.verx10 != 125,
.sparseResidency16Samples = has_sparse_or_fake &&
pdevice->info.verx10 != 125,
.variableMultisampleRate = true,
.inheritedQueries = true,
/* Vulkan 1.1 */
.storageBuffer16BitAccess = !pdevice->instance->no_16bit,
.uniformAndStorageBuffer16BitAccess = !pdevice->instance->no_16bit,
.storagePushConstant16 = true,
.storageInputOutput16 = false,
.multiview = true,
.multiviewGeometryShader = true,
.multiviewTessellationShader = true,
.variablePointersStorageBuffer = true,
.variablePointers = true,
.protectedMemory = pdevice->has_protected_contexts,
.samplerYcbcrConversion = true,
.shaderDrawParameters = true,
/* Vulkan 1.2 */
.samplerMirrorClampToEdge = true,
.drawIndirectCount = true,
.storageBuffer8BitAccess = true,
.uniformAndStorageBuffer8BitAccess = true,
.storagePushConstant8 = true,
.shaderBufferInt64Atomics = true,
.shaderSharedInt64Atomics = false,
.shaderFloat16 = !pdevice->instance->no_16bit,
.shaderInt8 = !pdevice->instance->no_16bit,
.descriptorIndexing = true,
.shaderInputAttachmentArrayDynamicIndexing = false,
.shaderUniformTexelBufferArrayDynamicIndexing = true,
.shaderStorageTexelBufferArrayDynamicIndexing = true,
.shaderUniformBufferArrayNonUniformIndexing = true,
.shaderSampledImageArrayNonUniformIndexing = true,
.shaderStorageBufferArrayNonUniformIndexing = true,
.shaderStorageImageArrayNonUniformIndexing = true,
.shaderInputAttachmentArrayNonUniformIndexing = false,
.shaderUniformTexelBufferArrayNonUniformIndexing = true,
.shaderStorageTexelBufferArrayNonUniformIndexing = true,
.descriptorBindingUniformBufferUpdateAfterBind = true,
.descriptorBindingSampledImageUpdateAfterBind = true,
.descriptorBindingStorageImageUpdateAfterBind = true,
.descriptorBindingStorageBufferUpdateAfterBind = true,
.descriptorBindingUniformTexelBufferUpdateAfterBind = true,
.descriptorBindingStorageTexelBufferUpdateAfterBind = true,
.descriptorBindingUpdateUnusedWhilePending = true,
.descriptorBindingPartiallyBound = true,
.descriptorBindingVariableDescriptorCount = true,
.runtimeDescriptorArray = true,
.samplerFilterMinmax = true,
.scalarBlockLayout = true,
.imagelessFramebuffer = true,
.uniformBufferStandardLayout = true,
.shaderSubgroupExtendedTypes = true,
.separateDepthStencilLayouts = true,
.hostQueryReset = true,
.timelineSemaphore = true,
.bufferDeviceAddress = true,
.bufferDeviceAddressCaptureReplay = true,
.bufferDeviceAddressMultiDevice = false,
.vulkanMemoryModel = true,
.vulkanMemoryModelDeviceScope = true,
.vulkanMemoryModelAvailabilityVisibilityChains = true,
.shaderOutputViewportIndex = true,
.shaderOutputLayer = true,
.subgroupBroadcastDynamicId = true,
/* Vulkan 1.3 */
.robustImageAccess = true,
.inlineUniformBlock = true,
.descriptorBindingInlineUniformBlockUpdateAfterBind = true,
.pipelineCreationCacheControl = true,
.privateData = true,
.shaderDemoteToHelperInvocation = true,
.shaderTerminateInvocation = true,
.subgroupSizeControl = true,
.computeFullSubgroups = true,
.synchronization2 = true,
.textureCompressionASTC_HDR = false,
.shaderZeroInitializeWorkgroupMemory = true,
.dynamicRendering = true,
.shaderIntegerDotProduct = true,
.maintenance4 = true,
/* VK_EXT_4444_formats */
.formatA4R4G4B4 = true,
.formatA4B4G4R4 = false,
/* VK_KHR_acceleration_structure */
.accelerationStructure = rt_enabled,
.accelerationStructureCaptureReplay = false, /* TODO */
.accelerationStructureIndirectBuild = false, /* TODO */
.accelerationStructureHostCommands = false,
.descriptorBindingAccelerationStructureUpdateAfterBind = rt_enabled,
/* VK_EXT_border_color_swizzle */
.borderColorSwizzle = true,
.borderColorSwizzleFromImage = true,
/* VK_EXT_color_write_enable */
.colorWriteEnable = true,
/* VK_EXT_image_2d_view_of_3d */
.image2DViewOf3D = true,
.sampler2DViewOf3D = true,
/* VK_EXT_image_sliced_view_of_3d */
.imageSlicedViewOf3D = true,
/* VK_NV_compute_shader_derivatives */
.computeDerivativeGroupQuads = true,
.computeDerivativeGroupLinear = true,
/* VK_EXT_conditional_rendering */
.conditionalRendering = true,
.inheritedConditionalRendering = true,
/* VK_EXT_custom_border_color */
.customBorderColors = true,
.customBorderColorWithoutFormat = true,
/* VK_EXT_depth_clamp_zero_one */
.depthClampZeroOne = true,
/* VK_EXT_depth_clip_enable */
.depthClipEnable = true,
/* VK_EXT_fragment_shader_interlock */
.fragmentShaderSampleInterlock = true,
.fragmentShaderPixelInterlock = true,
.fragmentShaderShadingRateInterlock = false,
/* VK_EXT_global_priority_query */
.globalPriorityQuery = true,
/* VK_EXT_graphics_pipeline_library */
.graphicsPipelineLibrary =
pdevice->vk.supported_extensions.EXT_graphics_pipeline_library,
/* VK_KHR_fragment_shading_rate */
.pipelineFragmentShadingRate = true,
.primitiveFragmentShadingRate =
pdevice->info.has_coarse_pixel_primitive_and_cb,
.attachmentFragmentShadingRate =
pdevice->info.has_coarse_pixel_primitive_and_cb,
/* VK_EXT_image_view_min_lod */
.minLod = true,
/* VK_EXT_index_type_uint8 */
.indexTypeUint8 = true,
/* VK_EXT_line_rasterization */
/* Rectangular lines must use the strict algorithm, which is not
* supported for wide lines prior to ICL. See rasterization_mode for
* details and how the HW states are programmed.
*/
.rectangularLines = pdevice->info.ver >= 10,
.bresenhamLines = true,
/* Support for Smooth lines with MSAA was removed on gfx11. From the
* BSpec section "Multisample ModesState" table for "AA Line Support
* Requirements":
*
* GFX10:BUG:######## NUM_MULTISAMPLES == 1
*
* Fortunately, this isn't a case most people care about.
*/
.smoothLines = pdevice->info.ver < 10,
.stippledRectangularLines = false,
.stippledBresenhamLines = true,
.stippledSmoothLines = false,
/* VK_NV_mesh_shader */
.taskShaderNV = false,
.meshShaderNV = false,
/* VK_EXT_mesh_shader */
.taskShader = mesh_shader,
.meshShader = mesh_shader,
.multiviewMeshShader = false,
.primitiveFragmentShadingRateMeshShader = mesh_shader,
.meshShaderQueries = false,
/* VK_EXT_mutable_descriptor_type */
.mutableDescriptorType = true,
/* VK_KHR_performance_query */
.performanceCounterQueryPools = true,
/* HW only supports a single configuration at a time. */
.performanceCounterMultipleQueryPools = false,
/* VK_KHR_pipeline_executable_properties */
.pipelineExecutableInfo = true,
/* VK_EXT_primitives_generated_query */
.primitivesGeneratedQuery = true,
.primitivesGeneratedQueryWithRasterizerDiscard = false,
.primitivesGeneratedQueryWithNonZeroStreams = false,
/* VK_EXT_pipeline_library_group_handles */
.pipelineLibraryGroupHandles = true,
/* VK_EXT_provoking_vertex */
.provokingVertexLast = true,
.transformFeedbackPreservesProvokingVertex = true,
/* VK_KHR_ray_query */
.rayQuery = rt_enabled,
/* VK_KHR_ray_tracing_maintenance1 */
.rayTracingMaintenance1 = rt_enabled,
.rayTracingPipelineTraceRaysIndirect2 = rt_enabled,
/* VK_KHR_ray_tracing_pipeline */
.rayTracingPipeline = rt_enabled,
.rayTracingPipelineShaderGroupHandleCaptureReplay = false,
.rayTracingPipelineShaderGroupHandleCaptureReplayMixed = false,
.rayTracingPipelineTraceRaysIndirect = rt_enabled,
.rayTraversalPrimitiveCulling = rt_enabled,
/* VK_EXT_robustness2 */
.robustBufferAccess2 = true,
.robustImageAccess2 = true,
.nullDescriptor = true,
/* VK_EXT_shader_atomic_float */
.shaderBufferFloat32Atomics = true,
.shaderBufferFloat32AtomicAdd = pdevice->info.has_lsc,
.shaderBufferFloat64Atomics =
pdevice->info.has_64bit_float && pdevice->info.has_lsc,
.shaderBufferFloat64AtomicAdd = pdevice->info.ver >= 20,
.shaderSharedFloat32Atomics = true,
.shaderSharedFloat32AtomicAdd = false,
.shaderSharedFloat64Atomics = false,
.shaderSharedFloat64AtomicAdd = false,
.shaderImageFloat32Atomics = true,
.shaderImageFloat32AtomicAdd = pdevice->info.ver >= 20,
.sparseImageFloat32Atomics = false,
.sparseImageFloat32AtomicAdd = false,
/* VK_EXT_shader_atomic_float2 */
.shaderBufferFloat16Atomics = pdevice->info.has_lsc,
.shaderBufferFloat16AtomicAdd = false,
.shaderBufferFloat16AtomicMinMax = pdevice->info.has_lsc,
.shaderBufferFloat32AtomicMinMax = true,
.shaderBufferFloat64AtomicMinMax =
pdevice->info.has_64bit_float && pdevice->info.has_lsc &&
pdevice->info.ver < 20,
.shaderSharedFloat16Atomics = pdevice->info.has_lsc,
.shaderSharedFloat16AtomicAdd = false,
.shaderSharedFloat16AtomicMinMax = pdevice->info.has_lsc,
.shaderSharedFloat32AtomicMinMax = true,
.shaderSharedFloat64AtomicMinMax = false,
.shaderImageFloat32AtomicMinMax = false,
.sparseImageFloat32AtomicMinMax = false,
/* VK_KHR_shader_clock */
.shaderSubgroupClock = true,
.shaderDeviceClock = false,
/* VK_INTEL_shader_integer_functions2 */
.shaderIntegerFunctions2 = true,
/* VK_EXT_shader_module_identifier */
.shaderModuleIdentifier = true,
/* VK_KHR_shader_subgroup_uniform_control_flow */
.shaderSubgroupUniformControlFlow = true,
/* VK_EXT_texel_buffer_alignment */
.texelBufferAlignment = true,
/* VK_EXT_transform_feedback */
.transformFeedback = true,
.geometryStreams = true,
/* VK_KHR_vertex_attribute_divisor */
.vertexAttributeInstanceRateDivisor = true,
.vertexAttributeInstanceRateZeroDivisor = true,
/* VK_KHR_workgroup_memory_explicit_layout */
.workgroupMemoryExplicitLayout = true,
.workgroupMemoryExplicitLayoutScalarBlockLayout = true,
.workgroupMemoryExplicitLayout8BitAccess = true,
.workgroupMemoryExplicitLayout16BitAccess = true,
/* VK_EXT_ycbcr_image_arrays */
.ycbcrImageArrays = true,
/* VK_EXT_extended_dynamic_state */
.extendedDynamicState = true,
/* VK_EXT_extended_dynamic_state2 */
.extendedDynamicState2 = true,
.extendedDynamicState2LogicOp = true,
.extendedDynamicState2PatchControlPoints = true,
/* VK_EXT_extended_dynamic_state3 */
.extendedDynamicState3PolygonMode = true,
.extendedDynamicState3TessellationDomainOrigin = true,
.extendedDynamicState3RasterizationStream = true,
.extendedDynamicState3LineStippleEnable = true,
.extendedDynamicState3LineRasterizationMode = true,
.extendedDynamicState3LogicOpEnable = true,
.extendedDynamicState3AlphaToOneEnable = true,
.extendedDynamicState3DepthClipEnable = true,
.extendedDynamicState3DepthClampEnable = true,
.extendedDynamicState3DepthClipNegativeOneToOne = true,
.extendedDynamicState3ProvokingVertexMode = true,
.extendedDynamicState3ColorBlendEnable = true,
.extendedDynamicState3ColorWriteMask = true,
.extendedDynamicState3ColorBlendEquation = true,
.extendedDynamicState3SampleLocationsEnable = true,
.extendedDynamicState3SampleMask = true,
.extendedDynamicState3ConservativeRasterizationMode = true,
.extendedDynamicState3AlphaToCoverageEnable = true,
.extendedDynamicState3RasterizationSamples = true,
.extendedDynamicState3ExtraPrimitiveOverestimationSize = false,
.extendedDynamicState3ViewportWScalingEnable = false,
.extendedDynamicState3ViewportSwizzle = false,
.extendedDynamicState3ShadingRateImageEnable = false,
.extendedDynamicState3CoverageToColorEnable = false,
.extendedDynamicState3CoverageToColorLocation = false,
.extendedDynamicState3CoverageModulationMode = false,
.extendedDynamicState3CoverageModulationTableEnable = false,
.extendedDynamicState3CoverageModulationTable = false,
.extendedDynamicState3CoverageReductionMode = false,
.extendedDynamicState3RepresentativeFragmentTestEnable = false,
.extendedDynamicState3ColorBlendAdvanced = false,
/* VK_EXT_multi_draw */
.multiDraw = true,
/* VK_EXT_non_seamless_cube_map */
.nonSeamlessCubeMap = true,
/* VK_EXT_primitive_topology_list_restart */
.primitiveTopologyListRestart = true,
.primitiveTopologyPatchListRestart = true,
/* VK_EXT_depth_clip_control */
.depthClipControl = true,
/* VK_KHR_present_id */
.presentId = pdevice->vk.supported_extensions.KHR_present_id,
/* VK_KHR_present_wait */
.presentWait = pdevice->vk.supported_extensions.KHR_present_wait,
/* VK_EXT_vertex_input_dynamic_state */
.vertexInputDynamicState = true,
/* VK_KHR_ray_tracing_position_fetch */
.rayTracingPositionFetch = rt_enabled,
/* VK_EXT_dynamic_rendering_unused_attachments */
.dynamicRenderingUnusedAttachments = true,
/* VK_EXT_depth_bias_control */
.depthBiasControl = true,
.floatRepresentation = true,
.leastRepresentableValueForceUnormRepresentation = false,
.depthBiasExact = true,
/* VK_EXT_pipeline_robustness */
.pipelineRobustness = true,
/* VK_KHR_maintenance5 */
.maintenance5 = true,
/* VK_KHR_maintenance6 */
.maintenance6 = true,
/* VK_EXT_nested_command_buffer */
.nestedCommandBuffer = true,
.nestedCommandBufferRendering = true,
.nestedCommandBufferSimultaneousUse = false,
/* VK_KHR_cooperative_matrix */
.cooperativeMatrix = anv_has_cooperative_matrix(pdevice),
/* VK_KHR_shader_maximal_reconvergence */
.shaderMaximalReconvergence = true,
/* VK_KHR_shader_subgroup_rotate */
.shaderSubgroupRotate = true,
.shaderSubgroupRotateClustered = true,
/* VK_EXT_attachment_feedback_loop_layout */
.attachmentFeedbackLoopLayout = true,
/* VK_EXT_attachment_feedback_loop_dynamic_state */
.attachmentFeedbackLoopDynamicState = true,
/* VK_KHR_shader_expect_assume */
.shaderExpectAssume = true,
/* VK_EXT_descriptor_buffer */
.descriptorBuffer = true,
.descriptorBufferCaptureReplay = true,
.descriptorBufferImageLayoutIgnored = false,
.descriptorBufferPushDescriptors = true,
/* VK_EXT_map_memory_placed */
.memoryMapPlaced = true,
.memoryMapRangePlaced = false,
.memoryUnmapReserve = true,
/* VK_KHR_shader_quad_control */
.shaderQuadControl = true,
#ifdef ANV_USE_WSI_PLATFORM
/* VK_EXT_swapchain_maintenance1 */
.swapchainMaintenance1 = true,
#endif
/* VK_EXT_image_compression_control */
.imageCompressionControl = true,
/* VK_KHR_shader_float_controls2 */
.shaderFloatControls2 = true,
/* VK_EXT_legacy_vertex_attributes */
.legacyVertexAttributes = true,
/* VK_EXT_legacy_dithering */
.legacyDithering = true,
};
/* The new DOOM and Wolfenstein games require depthBounds without
* checking for it. They seem to run fine without it so just claim it's
* there and accept the consequences.
*/
if (app_info->engine_name && strcmp(app_info->engine_name, "idTech") == 0)
features->depthBounds = true;
}
#define MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS 64
#define MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS 64
#define MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS 256
#define MAX_CUSTOM_BORDER_COLORS 4096
static VkDeviceSize
anv_get_physical_device_max_heap_size(const struct anv_physical_device *pdevice)
{
VkDeviceSize ret = 0;
for (uint32_t i = 0; i < pdevice->memory.heap_count; i++) {
if (pdevice->memory.heaps[i].size > ret)
ret = pdevice->memory.heaps[i].size;
}
return ret;
}
static void
get_properties_1_1(const struct anv_physical_device *pdevice,
struct vk_properties *p)
{
memcpy(p->deviceUUID, pdevice->device_uuid, VK_UUID_SIZE);
memcpy(p->driverUUID, pdevice->driver_uuid, VK_UUID_SIZE);
memset(p->deviceLUID, 0, VK_LUID_SIZE);
p->deviceNodeMask = 0;
p->deviceLUIDValid = false;
p->subgroupSize = BRW_SUBGROUP_SIZE;
VkShaderStageFlags scalar_stages = 0;
for (unsigned stage = 0; stage < MESA_SHADER_STAGES; stage++) {
scalar_stages |= mesa_to_vk_shader_stage(stage);
}
if (pdevice->vk.supported_extensions.KHR_ray_tracing_pipeline) {
scalar_stages |= VK_SHADER_STAGE_RAYGEN_BIT_KHR |
VK_SHADER_STAGE_ANY_HIT_BIT_KHR |
VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR |
VK_SHADER_STAGE_MISS_BIT_KHR |
VK_SHADER_STAGE_INTERSECTION_BIT_KHR |
VK_SHADER_STAGE_CALLABLE_BIT_KHR;
}
if (pdevice->vk.supported_extensions.EXT_mesh_shader) {
scalar_stages |= VK_SHADER_STAGE_TASK_BIT_EXT |
VK_SHADER_STAGE_MESH_BIT_EXT;
}
p->subgroupSupportedStages = scalar_stages;
p->subgroupSupportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT |
VK_SUBGROUP_FEATURE_VOTE_BIT |
VK_SUBGROUP_FEATURE_BALLOT_BIT |
VK_SUBGROUP_FEATURE_SHUFFLE_BIT |
VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT |
VK_SUBGROUP_FEATURE_QUAD_BIT |
VK_SUBGROUP_FEATURE_ARITHMETIC_BIT |
VK_SUBGROUP_FEATURE_CLUSTERED_BIT |
VK_SUBGROUP_FEATURE_ROTATE_BIT_KHR |
VK_SUBGROUP_FEATURE_ROTATE_CLUSTERED_BIT_KHR;
p->subgroupQuadOperationsInAllStages = true;
p->pointClippingBehavior = VK_POINT_CLIPPING_BEHAVIOR_USER_CLIP_PLANES_ONLY;
p->maxMultiviewViewCount = 16;
p->maxMultiviewInstanceIndex = UINT32_MAX / 16;
/* Our protected implementation is a memory encryption mechanism; it
* shouldn't page fault, but it hangs the HW, so in terms of user
* visibility it's similar to a fault.
*/
p->protectedNoFault = false;
/* This value doesn't matter for us today as our per-stage descriptors are
* the real limit.
*/
p->maxPerSetDescriptors = 1024;
for (uint32_t i = 0; i < pdevice->memory.heap_count; i++) {
p->maxMemoryAllocationSize = MAX2(p->maxMemoryAllocationSize,
pdevice->memory.heaps[i].size);
}
}
static void
get_properties_1_2(const struct anv_physical_device *pdevice,
struct vk_properties *p)
{
p->driverID = VK_DRIVER_ID_INTEL_OPEN_SOURCE_MESA;
memset(p->driverName, 0, sizeof(p->driverName));
snprintf(p->driverName, VK_MAX_DRIVER_NAME_SIZE,
"Intel open-source Mesa driver");
memset(p->driverInfo, 0, sizeof(p->driverInfo));
snprintf(p->driverInfo, VK_MAX_DRIVER_INFO_SIZE,
"Mesa " PACKAGE_VERSION MESA_GIT_SHA1);
p->conformanceVersion = (VkConformanceVersion) {
.major = 1,
.minor = 3,
.subminor = 6,
.patch = 0,
};
p->denormBehaviorIndependence =
VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL;
p->roundingModeIndependence =
VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_NONE;
/* Broadwell does not support HF denorms and there are restrictions on
* other gens. According to Kabylake's PRM:
*
* "math - Extended Math Function
* [...]
* Restriction : Half-float denorms are always retained."
*/
p->shaderDenormFlushToZeroFloat16 = false;
p->shaderDenormPreserveFloat16 = true;
p->shaderRoundingModeRTEFloat16 = true;
p->shaderRoundingModeRTZFloat16 = true;
p->shaderSignedZeroInfNanPreserveFloat16 = true;
p->shaderDenormFlushToZeroFloat32 = true;
p->shaderDenormPreserveFloat32 = true;
p->shaderRoundingModeRTEFloat32 = true;
p->shaderRoundingModeRTZFloat32 = true;
p->shaderSignedZeroInfNanPreserveFloat32 = true;
p->shaderDenormFlushToZeroFloat64 = true;
p->shaderDenormPreserveFloat64 = true;
p->shaderRoundingModeRTEFloat64 = true;
p->shaderRoundingModeRTZFloat64 = true;
p->shaderSignedZeroInfNanPreserveFloat64 = true;
/* It's a bit hard to exactly map our implementation to the limits
* described by Vulkan. The bindless surface handle in the extended
* message descriptors is 20 bits and it's an index into the table of
* RENDER_SURFACE_STATE structs that starts at bindless surface base
* address. This means that we can have at most 1M surface states
* allocated at any given time. Since most image views take two
* descriptors, this means we have a limit of about 500K image views.
*
* However, since we allocate surface states at vkCreateImageView time,
* this means our limit is actually something on the order of 500K image
* views allocated at any time. The actual limit described by Vulkan, on
* the other hand, is a limit of how many you can have in a descriptor set.
* Assuming anyone using 1M descriptors will be using the same image view
* twice a bunch of times (or a bunch of null descriptors), we can safely
* advertise a larger limit here.
*/
const unsigned max_bindless_views =
anv_physical_device_bindless_heap_size(pdevice, false) / ANV_SURFACE_STATE_SIZE;
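/* Worked numbers behind max_bindless_views (illustrative only): */
#if 0
   const unsigned handle_bits = 20;                        /* bindless handle width */
   const unsigned max_surface_states = 1u << handle_bits;  /* 1,048,576 states */
   const unsigned descriptors_per_view = 2;                /* typical image view */
   const unsigned approx_max_views =
      max_surface_states / descriptors_per_view;           /* ~524,288 views */
#endif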
p->maxUpdateAfterBindDescriptorsInAllPools = max_bindless_views;
p->shaderUniformBufferArrayNonUniformIndexingNative = false;
p->shaderSampledImageArrayNonUniformIndexingNative = false;
p->shaderStorageBufferArrayNonUniformIndexingNative = true;
p->shaderStorageImageArrayNonUniformIndexingNative = false;
p->shaderInputAttachmentArrayNonUniformIndexingNative = false;
p->robustBufferAccessUpdateAfterBind = true;
p->quadDivergentImplicitLod = false;
p->maxPerStageDescriptorUpdateAfterBindSamplers = max_bindless_views;
p->maxPerStageDescriptorUpdateAfterBindUniformBuffers = MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS;
p->maxPerStageDescriptorUpdateAfterBindStorageBuffers = UINT32_MAX;
p->maxPerStageDescriptorUpdateAfterBindSampledImages = max_bindless_views;
p->maxPerStageDescriptorUpdateAfterBindStorageImages = max_bindless_views;
p->maxPerStageDescriptorUpdateAfterBindInputAttachments = MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS;
p->maxPerStageUpdateAfterBindResources = UINT32_MAX;
p->maxDescriptorSetUpdateAfterBindSamplers = max_bindless_views;
p->maxDescriptorSetUpdateAfterBindUniformBuffers = 6 * MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS;
p->maxDescriptorSetUpdateAfterBindUniformBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2;
p->maxDescriptorSetUpdateAfterBindStorageBuffers = UINT32_MAX;
p->maxDescriptorSetUpdateAfterBindStorageBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2;
p->maxDescriptorSetUpdateAfterBindSampledImages = max_bindless_views;
p->maxDescriptorSetUpdateAfterBindStorageImages = max_bindless_views;
p->maxDescriptorSetUpdateAfterBindInputAttachments = MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS;
/* We support all of the depth resolve modes */
p->supportedDepthResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT |
VK_RESOLVE_MODE_AVERAGE_BIT |
VK_RESOLVE_MODE_MIN_BIT |
VK_RESOLVE_MODE_MAX_BIT;
/* Average doesn't make sense for stencil so we don't support that */
p->supportedStencilResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT |
VK_RESOLVE_MODE_MIN_BIT |
VK_RESOLVE_MODE_MAX_BIT;
p->independentResolveNone = true;
p->independentResolve = true;
p->filterMinmaxSingleComponentFormats = true;
p->filterMinmaxImageComponentMapping = true;
p->maxTimelineSemaphoreValueDifference = UINT64_MAX;
p->framebufferIntegerColorSampleCounts =
isl_device_get_sample_counts(&pdevice->isl_dev);
}
static void
get_properties_1_3(const struct anv_physical_device *pdevice,
struct vk_properties *p)
{
if (pdevice->info.ver >= 20)
p->minSubgroupSize = 16;
else
p->minSubgroupSize = 8;
p->maxSubgroupSize = 32;
p->maxComputeWorkgroupSubgroups = pdevice->info.max_cs_workgroup_threads;
p->requiredSubgroupSizeStages = VK_SHADER_STAGE_COMPUTE_BIT |
VK_SHADER_STAGE_TASK_BIT_EXT |
VK_SHADER_STAGE_MESH_BIT_EXT;
p->maxInlineUniformBlockSize = MAX_INLINE_UNIFORM_BLOCK_SIZE;
p->maxPerStageDescriptorInlineUniformBlocks =
MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS;
p->maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks =
MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS;
p->maxDescriptorSetInlineUniformBlocks =
MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS;
p->maxDescriptorSetUpdateAfterBindInlineUniformBlocks =
MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS;
p->maxInlineUniformTotalSize = UINT16_MAX;
p->integerDotProduct8BitUnsignedAccelerated = false;
p->integerDotProduct8BitSignedAccelerated = false;
p->integerDotProduct8BitMixedSignednessAccelerated = false;
p->integerDotProduct4x8BitPackedUnsignedAccelerated = pdevice->info.ver >= 12;
p->integerDotProduct4x8BitPackedSignedAccelerated = pdevice->info.ver >= 12;
p->integerDotProduct4x8BitPackedMixedSignednessAccelerated = pdevice->info.ver >= 12;
p->integerDotProduct16BitUnsignedAccelerated = false;
p->integerDotProduct16BitSignedAccelerated = false;
p->integerDotProduct16BitMixedSignednessAccelerated = false;
p->integerDotProduct32BitUnsignedAccelerated = false;
p->integerDotProduct32BitSignedAccelerated = false;
p->integerDotProduct32BitMixedSignednessAccelerated = false;
p->integerDotProduct64BitUnsignedAccelerated = false;
p->integerDotProduct64BitSignedAccelerated = false;
p->integerDotProduct64BitMixedSignednessAccelerated = false;
p->integerDotProductAccumulatingSaturating8BitUnsignedAccelerated = false;
p->integerDotProductAccumulatingSaturating8BitSignedAccelerated = false;
p->integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated = false;
p->integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated = pdevice->info.ver >= 12;
p->integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated = pdevice->info.ver >= 12;
p->integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated = pdevice->info.ver >= 12;
p->integerDotProductAccumulatingSaturating16BitUnsignedAccelerated = false;
p->integerDotProductAccumulatingSaturating16BitSignedAccelerated = false;
p->integerDotProductAccumulatingSaturating16BitMixedSignednessAccelerated = false;
p->integerDotProductAccumulatingSaturating32BitUnsignedAccelerated = false;
p->integerDotProductAccumulatingSaturating32BitSignedAccelerated = false;
p->integerDotProductAccumulatingSaturating32BitMixedSignednessAccelerated = false;
p->integerDotProductAccumulatingSaturating64BitUnsignedAccelerated = false;
p->integerDotProductAccumulatingSaturating64BitSignedAccelerated = false;
p->integerDotProductAccumulatingSaturating64BitMixedSignednessAccelerated = false;
/* From the SKL PRM Vol. 2d, docs for RENDER_SURFACE_STATE::Surface
* Base Address:
*
* "For SURFTYPE_BUFFER non-rendertarget surfaces, this field
* specifies the base address of the first element of the surface,
* computed in software by adding the surface base address to the
* byte offset of the element in the buffer. The base address must
* be aligned to element size."
*
* The typed dataport messages require that things be texel aligned.
* Otherwise, we may just load/store the wrong data or, in the worst
* case, there may be hangs.
*/
p->storageTexelBufferOffsetAlignmentBytes = 16;
p->storageTexelBufferOffsetSingleTexelAlignment = true;
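/* Quick illustration of the 16B rule above: rounding a raw byte offset up
* to the advertised storage texel buffer alignment (illustrative helper,
* not driver code):
*/
#if 0
   static inline uint64_t
   storage_texel_align_offset(uint64_t offset)
   {
      return (offset + 15) & ~(uint64_t)15;
   }
#endif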
/* The sampler, however, is much more forgiving and it can handle
* arbitrary byte alignment for linear and buffer surfaces. It's
* hard to find a good PRM citation for this but years of empirical
* experience demonstrate that this is true.
*/
p->uniformTexelBufferOffsetAlignmentBytes = 1;
p->uniformTexelBufferOffsetSingleTexelAlignment = true;
p->maxBufferSize = pdevice->isl_dev.max_buffer_size;
}
static void
get_properties(const struct anv_physical_device *pdevice,
struct vk_properties *props)
{
const struct intel_device_info *devinfo = &pdevice->info;
const uint32_t max_ssbos = UINT16_MAX;
const uint32_t max_textures = UINT16_MAX;
const uint32_t max_samplers = UINT16_MAX;
const uint32_t max_images = UINT16_MAX;
const VkDeviceSize max_heap_size = anv_get_physical_device_max_heap_size(pdevice);
/* Claim a high per-stage limit since we have bindless. */
const uint32_t max_per_stage = UINT32_MAX;
const uint32_t max_workgroup_size =
MIN2(1024, 32 * devinfo->max_cs_workgroup_threads);
const bool has_sparse_or_fake = pdevice->sparse_type != ANV_SPARSE_TYPE_NOT_SUPPORTED;
const bool sparse_uses_trtt = pdevice->sparse_type == ANV_SPARSE_TYPE_TRTT;
uint64_t sparse_addr_space_size =
!has_sparse_or_fake ? 0 :
sparse_uses_trtt ? pdevice->va.trtt.size :
pdevice->va.high_heap.size;
VkSampleCountFlags sample_counts =
isl_device_get_sample_counts(&pdevice->isl_dev);
#if DETECT_OS_ANDROID
/* Used to fill struct VkPhysicalDevicePresentationPropertiesANDROID */
uint64_t front_rendering_usage = 0;
struct u_gralloc *gralloc = u_gralloc_create(U_GRALLOC_TYPE_AUTO);
if (gralloc != NULL) {
u_gralloc_get_front_rendering_usage(gralloc, &front_rendering_usage);
u_gralloc_destroy(&gralloc);
}
#endif /* DETECT_OS_ANDROID */
*props = (struct vk_properties) {
.apiVersion = ANV_API_VERSION,
.driverVersion = vk_get_driver_version(),
.vendorID = pdevice->instance->force_vk_vendor != 0 ?
pdevice->instance->force_vk_vendor : 0x8086,
.deviceID = pdevice->info.pci_device_id,
.deviceType = pdevice->info.has_local_mem ?
VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU :
VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU,
/* Limits: */
.maxImageDimension1D = (1 << 14),
.maxImageDimension2D = (1 << 14),
.maxImageDimension3D = (1 << 11),
.maxImageDimensionCube = (1 << 14),
.maxImageArrayLayers = (1 << 11),
.maxTexelBufferElements = 128 * 1024 * 1024,
.maxUniformBufferRange = pdevice->compiler->indirect_ubos_use_sampler ? (1u << 27) : (1u << 30),
.maxStorageBufferRange = MIN3(pdevice->isl_dev.max_buffer_size, max_heap_size, UINT32_MAX),
.maxPushConstantsSize = MAX_PUSH_CONSTANTS_SIZE,
.maxMemoryAllocationCount = UINT32_MAX,
.maxSamplerAllocationCount = 64 * 1024,
.bufferImageGranularity = 1,
.sparseAddressSpaceSize = sparse_addr_space_size,
.maxBoundDescriptorSets = MAX_SETS,
.maxPerStageDescriptorSamplers = max_samplers,
.maxPerStageDescriptorUniformBuffers = MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS,
.maxPerStageDescriptorStorageBuffers = max_ssbos,
.maxPerStageDescriptorSampledImages = max_textures,
.maxPerStageDescriptorStorageImages = max_images,
.maxPerStageDescriptorInputAttachments = MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS,
.maxPerStageResources = max_per_stage,
.maxDescriptorSetSamplers = 6 * max_samplers, /* number of stages * maxPerStageDescriptorSamplers */
.maxDescriptorSetUniformBuffers = 6 * MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS, /* number of stages * maxPerStageDescriptorUniformBuffers */
.maxDescriptorSetUniformBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2,
.maxDescriptorSetStorageBuffers = 6 * max_ssbos, /* number of stages * maxPerStageDescriptorStorageBuffers */
.maxDescriptorSetStorageBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2,
.maxDescriptorSetSampledImages = 6 * max_textures, /* number of stages * maxPerStageDescriptorSampledImages */
.maxDescriptorSetStorageImages = 6 * max_images, /* number of stages * maxPerStageDescriptorStorageImages */
.maxDescriptorSetInputAttachments = MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS,
.maxVertexInputAttributes = MAX_VES,
.maxVertexInputBindings = MAX_VBS,
/* Broadwell PRMs: Volume 2d: Command Reference: Structures:
*
* VERTEX_ELEMENT_STATE::Source Element Offset: [0,2047]
*/
.maxVertexInputAttributeOffset = 2047,
/* Skylake PRMs: Volume 2d: Command Reference: Structures:
*
* VERTEX_BUFFER_STATE::Buffer Pitch: [0,4095]
*/
.maxVertexInputBindingStride = 4095,
.maxVertexOutputComponents = 128,
.maxTessellationGenerationLevel = 64,
.maxTessellationPatchSize = 32,
.maxTessellationControlPerVertexInputComponents = 128,
.maxTessellationControlPerVertexOutputComponents = 128,
.maxTessellationControlPerPatchOutputComponents = 128,
.maxTessellationControlTotalOutputComponents = 2048,
.maxTessellationEvaluationInputComponents = 128,
.maxTessellationEvaluationOutputComponents = 128,
.maxGeometryShaderInvocations = 32,
.maxGeometryInputComponents = 128,
.maxGeometryOutputComponents = 128,
.maxGeometryOutputVertices = 256,
.maxGeometryTotalOutputComponents = 1024,
.maxFragmentInputComponents = 116, /* 128 components - (PSIZ, CLIP_DIST0, CLIP_DIST1) */
.maxFragmentOutputAttachments = 8,
.maxFragmentDualSrcAttachments = 1,
.maxFragmentCombinedOutputResources = MAX_RTS + max_ssbos + max_images,
.maxComputeSharedMemorySize = 64 * 1024,
.maxComputeWorkGroupCount = { 65535, 65535, 65535 },
.maxComputeWorkGroupInvocations = max_workgroup_size,
.maxComputeWorkGroupSize = {
max_workgroup_size,
max_workgroup_size,
max_workgroup_size,
},
.subPixelPrecisionBits = 8,
.subTexelPrecisionBits = 8,
.mipmapPrecisionBits = 8,
.maxDrawIndexedIndexValue = UINT32_MAX,
.maxDrawIndirectCount = UINT32_MAX,
.maxSamplerLodBias = 16,
.maxSamplerAnisotropy = 16,
.maxViewports = MAX_VIEWPORTS,
.maxViewportDimensions = { (1 << 14), (1 << 14) },
.viewportBoundsRange = { INT16_MIN, INT16_MAX },
.viewportSubPixelBits = 13, /* We take a float? */
.minMemoryMapAlignment = 4096, /* A page */
/* The dataport requires texel alignment so we need to assume a worst
* case of R32G32B32A32 which is 16 bytes.
*/
.minTexelBufferOffsetAlignment = 16,
.minUniformBufferOffsetAlignment = ANV_UBO_ALIGNMENT,
.minStorageBufferOffsetAlignment = ANV_SSBO_ALIGNMENT,
.minTexelOffset = -8,
.maxTexelOffset = 7,
.minTexelGatherOffset = -32,
.maxTexelGatherOffset = 31,
.minInterpolationOffset = -0.5,
.maxInterpolationOffset = 0.4375,
.subPixelInterpolationOffsetBits = 4,
.maxFramebufferWidth = (1 << 14),
.maxFramebufferHeight = (1 << 14),
.maxFramebufferLayers = (1 << 11),
.framebufferColorSampleCounts = sample_counts,
.framebufferDepthSampleCounts = sample_counts,
.framebufferStencilSampleCounts = sample_counts,
.framebufferNoAttachmentsSampleCounts = sample_counts,
.maxColorAttachments = MAX_RTS,
.sampledImageColorSampleCounts = sample_counts,
.sampledImageIntegerSampleCounts = sample_counts,
.sampledImageDepthSampleCounts = sample_counts,
.sampledImageStencilSampleCounts = sample_counts,
.storageImageSampleCounts = VK_SAMPLE_COUNT_1_BIT,
.maxSampleMaskWords = 1,
.timestampComputeAndGraphics = true,
.timestampPeriod = 1000000000.0 / devinfo->timestamp_frequency,
.maxClipDistances = 8,
.maxCullDistances = 8,
.maxCombinedClipAndCullDistances = 8,
.discreteQueuePriorities = 2,
.pointSizeRange = { 0.125, 255.875 },
/* While SKL and up support much wider lines than we are setting here,
* in practice we run into conformance issues if we go past this limit.
* Since the Windows driver does the same, it's probably fair to assume
* that no one needs more than this.
*/
.lineWidthRange = { 0.0, 8.0 },
.pointSizeGranularity = (1.0 / 8.0),
.lineWidthGranularity = (1.0 / 128.0),
.strictLines = false,
.standardSampleLocations = true,
.optimalBufferCopyOffsetAlignment = 128,
.optimalBufferCopyRowPitchAlignment = 128,
.nonCoherentAtomSize = 64,
/* Sparse: */
.sparseResidencyStandard2DBlockShape = has_sparse_or_fake,
.sparseResidencyStandard2DMultisampleBlockShape = false,
.sparseResidencyStandard3DBlockShape = has_sparse_or_fake,
.sparseResidencyAlignedMipSize = false,
.sparseResidencyNonResidentStrict = has_sparse_or_fake,
/* VK_KHR_cooperative_matrix */
.cooperativeMatrixSupportedStages = VK_SHADER_STAGE_COMPUTE_BIT,
};
snprintf(props->deviceName, sizeof(props->deviceName),
"%s", pdevice->info.name);
memcpy(props->pipelineCacheUUID,
pdevice->pipeline_cache_uuid, VK_UUID_SIZE);
get_properties_1_1(pdevice, props);
get_properties_1_2(pdevice, props);
get_properties_1_3(pdevice, props);
/* VK_KHR_acceleration_structure */
{
props->maxGeometryCount = (1u << 24) - 1;
props->maxInstanceCount = (1u << 24) - 1;
props->maxPrimitiveCount = (1u << 29) - 1;
props->maxPerStageDescriptorAccelerationStructures = UINT16_MAX;
props->maxPerStageDescriptorUpdateAfterBindAccelerationStructures = UINT16_MAX;
props->maxDescriptorSetAccelerationStructures = UINT16_MAX;
props->maxDescriptorSetUpdateAfterBindAccelerationStructures = UINT16_MAX;
props->minAccelerationStructureScratchOffsetAlignment = 64;
}
/* VK_KHR_fragment_shading_rate */
{
props->primitiveFragmentShadingRateWithMultipleViewports =
pdevice->info.has_coarse_pixel_primitive_and_cb;
props->layeredShadingRateAttachments =
pdevice->info.has_coarse_pixel_primitive_and_cb;
props->fragmentShadingRateNonTrivialCombinerOps =
pdevice->info.has_coarse_pixel_primitive_and_cb;
props->maxFragmentSize = (VkExtent2D) { 4, 4 };
props->maxFragmentSizeAspectRatio =
pdevice->info.has_coarse_pixel_primitive_and_cb ?
2 : 4;
props->maxFragmentShadingRateCoverageSamples = 4 * 4 *
(pdevice->info.has_coarse_pixel_primitive_and_cb ? 4 : 16);
props->maxFragmentShadingRateRasterizationSamples =
pdevice->info.has_coarse_pixel_primitive_and_cb ?
VK_SAMPLE_COUNT_4_BIT : VK_SAMPLE_COUNT_16_BIT;
props->fragmentShadingRateWithShaderDepthStencilWrites = false;
props->fragmentShadingRateWithSampleMask = true;
props->fragmentShadingRateWithShaderSampleMask = false;
props->fragmentShadingRateWithConservativeRasterization = true;
props->fragmentShadingRateWithFragmentShaderInterlock = true;
props->fragmentShadingRateWithCustomSampleLocations = true;
props->fragmentShadingRateStrictMultiplyCombiner = true;
if (pdevice->info.has_coarse_pixel_primitive_and_cb) {
props->minFragmentShadingRateAttachmentTexelSize = (VkExtent2D) { 8, 8 };
props->maxFragmentShadingRateAttachmentTexelSize = (VkExtent2D) { 8, 8 };
props->maxFragmentShadingRateAttachmentTexelSizeAspectRatio = 1;
} else {
/* Those must be 0 if attachmentFragmentShadingRate is not supported. */
props->minFragmentShadingRateAttachmentTexelSize = (VkExtent2D) { 0, 0 };
props->maxFragmentShadingRateAttachmentTexelSize = (VkExtent2D) { 0, 0 };
props->maxFragmentShadingRateAttachmentTexelSizeAspectRatio = 0;
}
}
/* VK_KHR_maintenance5 */
{
props->earlyFragmentMultisampleCoverageAfterSampleCounting = false;
props->earlyFragmentSampleMaskTestBeforeSampleCounting = false;
props->depthStencilSwizzleOneSupport = true;
props->polygonModePointSize = true;
props->nonStrictSinglePixelWideLinesUseParallelogram = false;
props->nonStrictWideLinesUseParallelogram = false;
}
/* VK_KHR_maintenance6 */
{
props->blockTexelViewCompatibleMultipleLayers = true;
props->maxCombinedImageSamplerDescriptorCount = 3;
props->fragmentShadingRateClampCombinerInputs = true;
}
/* VK_KHR_performance_query */
{
props->allowCommandBufferQueryCopies = false;
}
/* VK_KHR_push_descriptor */
{
props->maxPushDescriptors = MAX_PUSH_DESCRIPTORS;
}
/* VK_KHR_ray_tracing_pipeline */
{
/* TODO */
props->shaderGroupHandleSize = 32;
props->maxRayRecursionDepth = 31;
/* MemRay::hitGroupSRStride is 16 bits */
props->maxShaderGroupStride = UINT16_MAX;
/* MemRay::hitGroupSRBasePtr requires 16B alignment */
props->shaderGroupBaseAlignment = 16;
props->shaderGroupHandleAlignment = 16;
props->shaderGroupHandleCaptureReplaySize = 32;
props->maxRayDispatchInvocationCount = 1U << 30; /* required min limit */
props->maxRayHitAttributeSize = BRW_RT_SIZEOF_HIT_ATTRIB_DATA;
}
/* VK_KHR_vertex_attribute_divisor */
{
props->maxVertexAttribDivisor = UINT32_MAX / 16;
props->supportsNonZeroFirstInstance = true;
}
/* VK_EXT_conservative_rasterization */
{
/* There's nothing in the public docs about this value as far as I can
* tell. However, this is the value the Windows driver reports and
* there's a comment on a rejected HW feature in the internal docs that
* says:
*
* "This is similar to conservative rasterization, except the
* primitive area is not extended by 1/512 and..."
*
* That's a bit of an obtuse reference but it's the best we've got for
* now.
*/
props->primitiveOverestimationSize = 1.0f / 512.0f;
props->maxExtraPrimitiveOverestimationSize = 0.0f;
props->extraPrimitiveOverestimationSizeGranularity = 0.0f;
props->primitiveUnderestimation = false;
props->conservativePointAndLineRasterization = false;
props->degenerateTrianglesRasterized = true;
props->degenerateLinesRasterized = false;
props->fullyCoveredFragmentShaderInputVariable = false;
props->conservativeRasterizationPostDepthCoverage = true;
}
/* VK_EXT_custom_border_color */
{
props->maxCustomBorderColorSamplers = MAX_CUSTOM_BORDER_COLORS;
}
/* VK_EXT_descriptor_buffer */
{
props->combinedImageSamplerDescriptorSingleArray = true;
props->bufferlessPushDescriptors = true;
/* Written to the buffer before a timeline semaphore is signaled, but
* after vkQueueSubmit().
*/
props->allowSamplerImageViewPostSubmitCreation = true;
props->descriptorBufferOffsetAlignment = ANV_SURFACE_STATE_SIZE;
if (pdevice->uses_ex_bso) {
props->maxDescriptorBufferBindings = MAX_SETS;
props->maxResourceDescriptorBufferBindings = MAX_SETS;
props->maxSamplerDescriptorBufferBindings = MAX_SETS;
props->maxEmbeddedImmutableSamplerBindings = MAX_SETS;
} else {
props->maxDescriptorBufferBindings = 3; /* resources, samplers, push (we don't care about push) */
props->maxResourceDescriptorBufferBindings = 1;
props->maxSamplerDescriptorBufferBindings = 1;
props->maxEmbeddedImmutableSamplerBindings = 1;
}
props->maxEmbeddedImmutableSamplers = MAX_EMBEDDED_SAMPLERS;
/* Storing a 64bit address */
props->bufferCaptureReplayDescriptorDataSize = 8;
props->imageCaptureReplayDescriptorDataSize = 8;
/* Offset inside the reserved border color pool */
props->samplerCaptureReplayDescriptorDataSize = 4;
/* Not affected by replay */
props->imageViewCaptureReplayDescriptorDataSize = 0;
/* The acceleration structure virtual address backing is coming from a
* buffer, so as long as that buffer is captured/replayed correctly we
* should always get the same address.
*/
props->accelerationStructureCaptureReplayDescriptorDataSize = 0;
props->samplerDescriptorSize = ANV_SAMPLER_STATE_SIZE;
props->combinedImageSamplerDescriptorSize = align(ANV_SURFACE_STATE_SIZE + ANV_SAMPLER_STATE_SIZE,
ANV_SURFACE_STATE_SIZE);
props->sampledImageDescriptorSize = ANV_SURFACE_STATE_SIZE;
props->storageImageDescriptorSize = ANV_SURFACE_STATE_SIZE;
props->uniformTexelBufferDescriptorSize = ANV_SURFACE_STATE_SIZE;
props->robustUniformTexelBufferDescriptorSize = ANV_SURFACE_STATE_SIZE;
props->storageTexelBufferDescriptorSize = ANV_SURFACE_STATE_SIZE;
props->robustStorageTexelBufferDescriptorSize = ANV_SURFACE_STATE_SIZE;
props->uniformBufferDescriptorSize = ANV_SURFACE_STATE_SIZE;
props->robustUniformBufferDescriptorSize = ANV_SURFACE_STATE_SIZE;
props->storageBufferDescriptorSize = ANV_SURFACE_STATE_SIZE;
props->robustStorageBufferDescriptorSize = ANV_SURFACE_STATE_SIZE;
props->inputAttachmentDescriptorSize = ANV_SURFACE_STATE_SIZE;
props->accelerationStructureDescriptorSize = sizeof(struct anv_address_range_descriptor);
props->maxSamplerDescriptorBufferRange = pdevice->va.descriptor_buffer_pool.size;
props->maxResourceDescriptorBufferRange = anv_physical_device_bindless_heap_size(pdevice,
true);
props->resourceDescriptorBufferAddressSpaceSize = pdevice->va.descriptor_buffer_pool.size;
props->descriptorBufferAddressSpaceSize = pdevice->va.descriptor_buffer_pool.size;
props->samplerDescriptorBufferAddressSpaceSize = pdevice->va.descriptor_buffer_pool.size;
}
/* VK_EXT_extended_dynamic_state3 */
{
props->dynamicPrimitiveTopologyUnrestricted = true;
}
/* VK_EXT_external_memory_host */
{
props->minImportedHostPointerAlignment = 4096;
}
/* VK_EXT_graphics_pipeline_library */
{
props->graphicsPipelineLibraryFastLinking = true;
props->graphicsPipelineLibraryIndependentInterpolationDecoration = true;
}
/* VK_EXT_legacy_vertex_attributes */
{
props->nativeUnalignedPerformance = true;
}
/* VK_EXT_line_rasterization */
{
/* In the Skylake PRM Vol. 7, subsection titled "GIQ (Diamond) Sampling
* Rules - Legacy Mode", it says the following:
*
* "Note that the device divides a pixel into a 16x16 array of
* subpixels, referenced by their upper left corners."
*
* This is the only known reference in the PRMs to the subpixel
* precision of line rasterization and a "16x16 array of subpixels"
       * implies 4 bits of subpixel precision. Empirical testing has shown
       * that this 4-bit precision applies to all line rasterization types.
*/
props->lineSubPixelPrecisionBits = 4;
}
/* VK_EXT_map_memory_placed */
{
props->minPlacedMemoryMapAlignment = 4096;
}
/* VK_EXT_mesh_shader */
{
/* Bounded by the maximum representable size in
* 3DSTATE_MESH_SHADER_BODY::SharedLocalMemorySize. Same for Task.
*/
const uint32_t max_slm_size = 64 * 1024;
/* Bounded by the maximum representable size in
* 3DSTATE_MESH_SHADER_BODY::LocalXMaximum. Same for Task.
*/
const uint32_t max_workgroup_size = 1 << 10;
/* 3DMESH_3D limitation. */
const uint32_t max_threadgroup_count = 1 << 22;
/* 3DMESH_3D limitation. */
const uint32_t max_threadgroup_xyz = 65535;
const uint32_t max_urb_size = 64 * 1024;
props->maxTaskWorkGroupTotalCount = max_threadgroup_count;
props->maxTaskWorkGroupCount[0] = max_threadgroup_xyz;
props->maxTaskWorkGroupCount[1] = max_threadgroup_xyz;
props->maxTaskWorkGroupCount[2] = max_threadgroup_xyz;
props->maxTaskWorkGroupInvocations = max_workgroup_size;
props->maxTaskWorkGroupSize[0] = max_workgroup_size;
props->maxTaskWorkGroupSize[1] = max_workgroup_size;
props->maxTaskWorkGroupSize[2] = max_workgroup_size;
/* TUE header with padding */
const uint32_t task_payload_reserved = 32;
props->maxTaskPayloadSize = max_urb_size - task_payload_reserved;
props->maxTaskSharedMemorySize = max_slm_size;
props->maxTaskPayloadAndSharedMemorySize =
props->maxTaskPayloadSize +
props->maxTaskSharedMemorySize;
props->maxMeshWorkGroupTotalCount = max_threadgroup_count;
props->maxMeshWorkGroupCount[0] = max_threadgroup_xyz;
props->maxMeshWorkGroupCount[1] = max_threadgroup_xyz;
props->maxMeshWorkGroupCount[2] = max_threadgroup_xyz;
props->maxMeshWorkGroupInvocations = max_workgroup_size;
props->maxMeshWorkGroupSize[0] = max_workgroup_size;
props->maxMeshWorkGroupSize[1] = max_workgroup_size;
props->maxMeshWorkGroupSize[2] = max_workgroup_size;
props->maxMeshSharedMemorySize = max_slm_size;
props->maxMeshPayloadAndSharedMemorySize =
props->maxTaskPayloadSize +
props->maxMeshSharedMemorySize;
      /* Unfortunately the spec's formula for the maximum output size doesn't
       * match our hardware (some per-primitive and per-vertex attributes have
       * alignment restrictions), so we advertise the minimum values mandated
       * by the spec to avoid overflowing it.
       */
props->maxMeshOutputPrimitives = 256;
props->maxMeshOutputVertices = 256;
/* NumPrim + Primitive Data List */
const uint32_t max_indices_memory =
ALIGN(sizeof(uint32_t) +
sizeof(uint32_t) * props->maxMeshOutputVertices, 32);
props->maxMeshOutputMemorySize = MIN2(max_urb_size - max_indices_memory, 32768);
props->maxMeshPayloadAndOutputMemorySize =
props->maxTaskPayloadSize +
props->maxMeshOutputMemorySize;
props->maxMeshOutputComponents = 128;
/* RTAIndex is 11-bits wide */
props->maxMeshOutputLayers = 1 << 11;
props->maxMeshMultiviewViewCount = 1;
/* Elements in Vertex Data Array must be aligned to 32 bytes (8 dwords). */
props->meshOutputPerVertexGranularity = 8;
/* Elements in Primitive Data Array must be aligned to 32 bytes (8 dwords). */
props->meshOutputPerPrimitiveGranularity = 8;
/* SIMD16 */
props->maxPreferredTaskWorkGroupInvocations = 16;
props->maxPreferredMeshWorkGroupInvocations = 16;
props->prefersLocalInvocationVertexOutput = false;
props->prefersLocalInvocationPrimitiveOutput = false;
props->prefersCompactVertexOutput = false;
props->prefersCompactPrimitiveOutput = false;
/* Spec minimum values */
assert(props->maxTaskWorkGroupTotalCount >= (1U << 22));
assert(props->maxTaskWorkGroupCount[0] >= 65535);
assert(props->maxTaskWorkGroupCount[1] >= 65535);
assert(props->maxTaskWorkGroupCount[2] >= 65535);
assert(props->maxTaskWorkGroupInvocations >= 128);
assert(props->maxTaskWorkGroupSize[0] >= 128);
assert(props->maxTaskWorkGroupSize[1] >= 128);
assert(props->maxTaskWorkGroupSize[2] >= 128);
assert(props->maxTaskPayloadSize >= 16384);
assert(props->maxTaskSharedMemorySize >= 32768);
assert(props->maxTaskPayloadAndSharedMemorySize >= 32768);
assert(props->maxMeshWorkGroupTotalCount >= (1U << 22));
assert(props->maxMeshWorkGroupCount[0] >= 65535);
assert(props->maxMeshWorkGroupCount[1] >= 65535);
assert(props->maxMeshWorkGroupCount[2] >= 65535);
assert(props->maxMeshWorkGroupInvocations >= 128);
assert(props->maxMeshWorkGroupSize[0] >= 128);
assert(props->maxMeshWorkGroupSize[1] >= 128);
assert(props->maxMeshWorkGroupSize[2] >= 128);
assert(props->maxMeshSharedMemorySize >= 28672);
assert(props->maxMeshPayloadAndSharedMemorySize >= 28672);
assert(props->maxMeshOutputMemorySize >= 32768);
assert(props->maxMeshPayloadAndOutputMemorySize >= 48128);
assert(props->maxMeshOutputComponents >= 128);
assert(props->maxMeshOutputVertices >= 256);
assert(props->maxMeshOutputPrimitives >= 256);
assert(props->maxMeshOutputLayers >= 8);
assert(props->maxMeshMultiviewViewCount >= 1);
}
/* VK_EXT_multi_draw */
{
props->maxMultiDrawCount = 2048;
}
/* VK_EXT_nested_command_buffer */
{
props->maxCommandBufferNestingLevel = UINT32_MAX;
}
/* VK_EXT_pci_bus_info */
{
props->pciDomain = pdevice->info.pci_domain;
props->pciBus = pdevice->info.pci_bus;
props->pciDevice = pdevice->info.pci_dev;
props->pciFunction = pdevice->info.pci_func;
}
/* VK_EXT_physical_device_drm */
{
props->drmHasPrimary = pdevice->has_master;
props->drmPrimaryMajor = pdevice->master_major;
props->drmPrimaryMinor = pdevice->master_minor;
props->drmHasRender = pdevice->has_local;
props->drmRenderMajor = pdevice->local_major;
props->drmRenderMinor = pdevice->local_minor;
}
/* VK_EXT_pipeline_robustness */
{
props->defaultRobustnessStorageBuffers =
VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT;
props->defaultRobustnessUniformBuffers =
VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT;
props->defaultRobustnessVertexInputs =
VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT;
props->defaultRobustnessImages =
VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_ROBUST_IMAGE_ACCESS_2_EXT;
}
/* VK_EXT_provoking_vertex */
{
props->provokingVertexModePerPipeline = true;
props->transformFeedbackPreservesTriangleFanProvokingVertex = false;
}
/* VK_EXT_robustness2 */
{
props->robustStorageBufferAccessSizeAlignment =
ANV_SSBO_BOUNDS_CHECK_ALIGNMENT;
props->robustUniformBufferAccessSizeAlignment =
ANV_UBO_ALIGNMENT;
}
/* VK_EXT_sample_locations */
{
props->sampleLocationSampleCounts =
isl_device_get_sample_counts(&pdevice->isl_dev);
/* See also anv_GetPhysicalDeviceMultisamplePropertiesEXT */
props->maxSampleLocationGridSize.width = 1;
props->maxSampleLocationGridSize.height = 1;
props->sampleLocationCoordinateRange[0] = 0;
props->sampleLocationCoordinateRange[1] = 0.9375;
props->sampleLocationSubPixelBits = 4;
props->variableSampleLocations = true;
}
/* VK_EXT_shader_module_identifier */
{
STATIC_ASSERT(sizeof(vk_shaderModuleIdentifierAlgorithmUUID) ==
sizeof(props->shaderModuleIdentifierAlgorithmUUID));
memcpy(props->shaderModuleIdentifierAlgorithmUUID,
vk_shaderModuleIdentifierAlgorithmUUID,
sizeof(props->shaderModuleIdentifierAlgorithmUUID));
}
/* VK_EXT_transform_feedback */
{
props->maxTransformFeedbackStreams = MAX_XFB_STREAMS;
props->maxTransformFeedbackBuffers = MAX_XFB_BUFFERS;
props->maxTransformFeedbackBufferSize = (1ull << 32);
props->maxTransformFeedbackStreamDataSize = 128 * 4;
props->maxTransformFeedbackBufferDataSize = 128 * 4;
props->maxTransformFeedbackBufferDataStride = 2048;
props->transformFeedbackQueries = true;
props->transformFeedbackStreamsLinesTriangles = false;
props->transformFeedbackRasterizationStreamSelect = false;
props->transformFeedbackDraw = true;
}
/* VK_ANDROID_native_buffer */
#if DETECT_OS_ANDROID
{
props->sharedImage = front_rendering_usage ? VK_TRUE : VK_FALSE;
}
#endif /* DETECT_OS_ANDROID */
}
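/* Capture the memory region sizes and currently available amounts reported
 * by the kernel through the device info.
 */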
static VkResult MUST_CHECK
anv_init_meminfo(struct anv_physical_device *device, int fd)
{
const struct intel_device_info *devinfo = &device->info;
device->sys.region = &devinfo->mem.sram.mem;
device->sys.size = devinfo->mem.sram.mappable.size;
device->sys.available = devinfo->mem.sram.mappable.free;
device->vram_mappable.region = &devinfo->mem.vram.mem;
device->vram_mappable.size = devinfo->mem.vram.mappable.size;
device->vram_mappable.available = devinfo->mem.vram.mappable.free;
device->vram_non_mappable.region = &devinfo->mem.vram.mem;
device->vram_non_mappable.size = devinfo->mem.vram.unmappable.size;
device->vram_non_mappable.available = devinfo->mem.vram.unmappable.free;
return VK_SUCCESS;
}
static void
anv_update_meminfo(struct anv_physical_device *device, int fd)
{
if (!intel_device_info_update_memory_info(&device->info, fd))
return;
const struct intel_device_info *devinfo = &device->info;
device->sys.available = devinfo->mem.sram.mappable.free;
device->vram_mappable.available = devinfo->mem.vram.mappable.free;
device->vram_non_mappable.available = devinfo->mem.vram.unmappable.free;
}
static VkResult
anv_physical_device_init_heaps(struct anv_physical_device *device, int fd)
{
VkResult result = anv_init_meminfo(device, fd);
if (result != VK_SUCCESS)
return result;
assert(device->sys.size != 0);
if (anv_physical_device_has_vram(device)) {
      /* With local memory support we expose 2 or 3 heaps: the first covers
       * local memory, the second covers system memory, and a third is added
       * only when the vram is split into host-mappable and non-mappable
       * parts.
       */
device->memory.heap_count = 2;
device->memory.heaps[0] = (struct anv_memory_heap) {
/* If there is a vram_non_mappable, use that for the device only
* heap. Otherwise use the vram_mappable.
*/
.size = device->vram_non_mappable.size != 0 ?
device->vram_non_mappable.size : device->vram_mappable.size,
.flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
.is_local_mem = true,
};
device->memory.heaps[1] = (struct anv_memory_heap) {
.size = device->sys.size,
.flags = 0,
.is_local_mem = false,
};
/* Add an additional smaller vram mappable heap if we can't map all the
* vram to the host.
*/
if (device->vram_non_mappable.size > 0) {
device->memory.heap_count++;
device->memory.heaps[2] = (struct anv_memory_heap) {
.size = device->vram_mappable.size,
.flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
.is_local_mem = true,
};
}
} else {
device->memory.heap_count = 1;
device->memory.heaps[0] = (struct anv_memory_heap) {
.size = device->sys.size,
.flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
.is_local_mem = false,
};
}
switch (device->info.kmd_type) {
case INTEL_KMD_TYPE_XE:
result = anv_xe_physical_device_init_memory_types(device);
break;
case INTEL_KMD_TYPE_I915:
default:
result = anv_i915_physical_device_init_memory_types(device);
break;
}
if (result != VK_SUCCESS)
return result;
   /* Replicate all non-protected memory types for descriptor buffers, so that
    * we can identify those allocations and place them in the right memory
    * heap.
    */
device->memory.default_buffer_mem_types =
BITFIELD_RANGE(0, device->memory.type_count);
device->memory.protected_mem_types = 0;
device->memory.desc_buffer_mem_types = 0;
uint32_t base_types_count = device->memory.type_count;
for (int i = 0; i < base_types_count; i++) {
if (device->memory.types[i].propertyFlags &
VK_MEMORY_PROPERTY_PROTECTED_BIT) {
device->memory.protected_mem_types |= BITFIELD_BIT(i);
device->memory.default_buffer_mem_types &= (~BITFIELD_BIT(i));
continue;
}
assert(device->memory.type_count < ARRAY_SIZE(device->memory.types));
device->memory.desc_buffer_mem_types |=
BITFIELD_BIT(device->memory.type_count);
struct anv_memory_type *new_type =
&device->memory.types[device->memory.type_count++];
*new_type = device->memory.types[i];
new_type->descriptor_buffer = true;
}
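   /* Host-visible but non-coherent memory types require explicit CPU cache
    * flushes, which is only implemented for builds with integrated GPU
    * support.
    */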
for (unsigned i = 0; i < device->memory.type_count; i++) {
VkMemoryPropertyFlags props = device->memory.types[i].propertyFlags;
if ((props & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) &&
!(props & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT))
#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
device->memory.need_flush = true;
#else
return vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
"Memory configuration requires flushing, but it's not implemented for this architecture");
#endif
}
return VK_SUCCESS;
}
static VkResult
anv_physical_device_init_uuids(struct anv_physical_device *device)
{
const struct build_id_note *note =
build_id_find_nhdr_for_addr(anv_physical_device_init_uuids);
if (!note) {
return vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
"Failed to find build-id");
}
unsigned build_id_len = build_id_length(note);
if (build_id_len < 20) {
return vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
"build-id too short. It needs to be a SHA");
}
memcpy(device->driver_build_sha1, build_id_data(note), 20);
struct mesa_sha1 sha1_ctx;
uint8_t sha1[20];
STATIC_ASSERT(VK_UUID_SIZE <= sizeof(sha1));
/* The pipeline cache UUID is used for determining when a pipeline cache is
* invalid. It needs both a driver build and the PCI ID of the device.
*/
_mesa_sha1_init(&sha1_ctx);
_mesa_sha1_update(&sha1_ctx, build_id_data(note), build_id_len);
brw_device_sha1_update(&sha1_ctx, &device->info);
_mesa_sha1_update(&sha1_ctx, &device->always_use_bindless,
sizeof(device->always_use_bindless));
_mesa_sha1_final(&sha1_ctx, sha1);
memcpy(device->pipeline_cache_uuid, sha1, VK_UUID_SIZE);
intel_uuid_compute_driver_id(device->driver_uuid, &device->info, VK_UUID_SIZE);
intel_uuid_compute_device_id(device->device_uuid, &device->info, VK_UUID_SIZE);
return VK_SUCCESS;
}
static void
anv_physical_device_init_disk_cache(struct anv_physical_device *device)
{
#ifdef ENABLE_SHADER_CACHE
char renderer[10];
ASSERTED int len = snprintf(renderer, sizeof(renderer), "anv_%04x",
device->info.pci_device_id);
assert(len == sizeof(renderer) - 2);
char timestamp[41];
_mesa_sha1_format(timestamp, device->driver_build_sha1);
const uint64_t driver_flags =
brw_get_compiler_config_value(device->compiler);
device->vk.disk_cache = disk_cache_create(renderer, timestamp, driver_flags);
#endif
}
static void
anv_physical_device_free_disk_cache(struct anv_physical_device *device)
{
#ifdef ENABLE_SHADER_CACHE
if (device->vk.disk_cache) {
disk_cache_destroy(device->vk.disk_cache);
device->vk.disk_cache = NULL;
}
#else
assert(device->vk.disk_cache == NULL);
#endif
}
/* The ANV_QUEUE_OVERRIDE environment variable is a comma-separated list of
 * queue overrides.
 *
 * To override the number of queues:
* * "gc" is for graphics queues with compute support
* * "g" is for graphics queues with no compute support
* * "c" is for compute queues with no graphics support
* * "v" is for video queues with no graphics support
*
* For example, ANV_QUEUE_OVERRIDE=gc=2,c=1 would override the number of
* advertised queues to be 2 queues with graphics+compute support, and 1 queue
* with compute-only support.
*
* ANV_QUEUE_OVERRIDE=c=1 would override the number of advertised queues to
* include 1 queue with compute-only support, but it will not change the
* number of graphics+compute queues.
*
* ANV_QUEUE_OVERRIDE=gc=0,c=1 would override the number of advertised queues
* to include 1 queue with compute-only support, and it would override the
* number of graphics+compute queues to be 0.
*/
static void
anv_override_engine_counts(int *gc_count, int *g_count, int *c_count, int *v_count)
{
int gc_override = -1;
int g_override = -1;
int c_override = -1;
int v_override = -1;
char *env = getenv("ANV_QUEUE_OVERRIDE");
if (env == NULL)
return;
env = strdup(env);
char *save = NULL;
char *next = strtok_r(env, ",", &save);
while (next != NULL) {
if (strncmp(next, "gc=", 3) == 0) {
gc_override = strtol(next + 3, NULL, 0);
} else if (strncmp(next, "g=", 2) == 0) {
g_override = strtol(next + 2, NULL, 0);
} else if (strncmp(next, "c=", 2) == 0) {
c_override = strtol(next + 2, NULL, 0);
} else if (strncmp(next, "v=", 2) == 0) {
v_override = strtol(next + 2, NULL, 0);
} else {
mesa_logw("Ignoring unsupported ANV_QUEUE_OVERRIDE token: %s", next);
}
next = strtok_r(NULL, ",", &save);
}
free(env);
if (gc_override >= 0)
*gc_count = gc_override;
if (g_override >= 0)
*g_count = g_override;
if (*g_count > 0 && *gc_count <= 0 && (gc_override >= 0 || g_override >= 0))
mesa_logw("ANV_QUEUE_OVERRIDE: gc=0 with g > 0 violates the "
"Vulkan specification");
if (c_override >= 0)
*c_count = c_override;
if (v_override >= 0)
*v_count = v_override;
}
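/* Build the set of queue families advertised to the application, based on the
 * engines exposed by the kernel (render, compute, video, copy) and any
 * ANV_QUEUE_OVERRIDE settings.
 */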
static void
anv_physical_device_init_queue_families(struct anv_physical_device *pdevice)
{
uint32_t family_count = 0;
VkQueueFlags sparse_flags = pdevice->sparse_type != ANV_SPARSE_TYPE_NOT_SUPPORTED ?
VK_QUEUE_SPARSE_BINDING_BIT : 0;
VkQueueFlags protected_flag = pdevice->has_protected_contexts ?
VK_QUEUE_PROTECTED_BIT : 0;
if (pdevice->engine_info) {
int gc_count =
intel_engines_count(pdevice->engine_info,
INTEL_ENGINE_CLASS_RENDER);
int v_count =
intel_engines_count(pdevice->engine_info, INTEL_ENGINE_CLASS_VIDEO);
int g_count = 0;
int c_count = 0;
const bool kernel_supports_non_render_engines = pdevice->has_vm_control;
const bool sparse_supports_non_render_engines =
pdevice->sparse_type != ANV_SPARSE_TYPE_TRTT;
const bool can_use_non_render_engines =
kernel_supports_non_render_engines &&
sparse_supports_non_render_engines;
if (can_use_non_render_engines) {
c_count = intel_engines_supported_count(pdevice->local_fd,
&pdevice->info,
pdevice->engine_info,
INTEL_ENGINE_CLASS_COMPUTE);
}
enum intel_engine_class compute_class =
c_count < 1 ? INTEL_ENGINE_CLASS_RENDER : INTEL_ENGINE_CLASS_COMPUTE;
int blit_count = 0;
if (pdevice->info.verx10 >= 125 && can_use_non_render_engines) {
blit_count = intel_engines_supported_count(pdevice->local_fd,
&pdevice->info,
pdevice->engine_info,
INTEL_ENGINE_CLASS_COPY);
}
anv_override_engine_counts(&gc_count, &g_count, &c_count, &v_count);
if (gc_count > 0) {
pdevice->queue.families[family_count++] = (struct anv_queue_family) {
.queueFlags = VK_QUEUE_GRAPHICS_BIT |
VK_QUEUE_COMPUTE_BIT |
VK_QUEUE_TRANSFER_BIT |
sparse_flags |
protected_flag,
.queueCount = gc_count,
.engine_class = INTEL_ENGINE_CLASS_RENDER,
};
}
if (g_count > 0) {
pdevice->queue.families[family_count++] = (struct anv_queue_family) {
.queueFlags = VK_QUEUE_GRAPHICS_BIT |
VK_QUEUE_TRANSFER_BIT |
sparse_flags |
protected_flag,
.queueCount = g_count,
.engine_class = INTEL_ENGINE_CLASS_RENDER,
};
}
if (c_count > 0) {
pdevice->queue.families[family_count++] = (struct anv_queue_family) {
.queueFlags = VK_QUEUE_COMPUTE_BIT |
VK_QUEUE_TRANSFER_BIT |
sparse_flags |
protected_flag,
.queueCount = c_count,
.engine_class = compute_class,
};
}
if (v_count > 0 && pdevice->video_decode_enabled) {
/* HEVC support on Gfx9 is only available on VCS0. So limit the number of video queues
* to the first VCS engine instance.
*
* We should be able to query HEVC support from the kernel using the engine query uAPI,
       * but this appears to be broken:
* https://gitlab.freedesktop.org/drm/intel/-/issues/8832
*
* When this bug is fixed we should be able to check HEVC support to determine the
* correct number of queues.
*/
/* TODO: enable protected content on video queue */
pdevice->queue.families[family_count++] = (struct anv_queue_family) {
.queueFlags = VK_QUEUE_VIDEO_DECODE_BIT_KHR,
.queueCount = pdevice->info.ver == 9 ? MIN2(1, v_count) : v_count,
.engine_class = INTEL_ENGINE_CLASS_VIDEO,
};
}
if (blit_count > 0) {
pdevice->queue.families[family_count++] = (struct anv_queue_family) {
.queueFlags = VK_QUEUE_TRANSFER_BIT |
protected_flag,
.queueCount = blit_count,
.engine_class = INTEL_ENGINE_CLASS_COPY,
};
}
} else {
/* Default to a single render queue */
pdevice->queue.families[family_count++] = (struct anv_queue_family) {
.queueFlags = VK_QUEUE_GRAPHICS_BIT |
VK_QUEUE_COMPUTE_BIT |
VK_QUEUE_TRANSFER_BIT |
sparse_flags,
.queueCount = 1,
.engine_class = INTEL_ENGINE_CLASS_RENDER,
};
family_count = 1;
}
assert(family_count <= ANV_MAX_QUEUE_FAMILIES);
pdevice->queue.family_count = family_count;
}
static VkResult
anv_physical_device_get_parameters(struct anv_physical_device *device)
{
switch (device->info.kmd_type) {
case INTEL_KMD_TYPE_I915:
return anv_i915_physical_device_get_parameters(device);
case INTEL_KMD_TYPE_XE:
return anv_xe_physical_device_get_parameters(device);
default:
unreachable("Missing");
return VK_ERROR_UNKNOWN;
}
}
static VkResult
anv_physical_device_try_create(struct vk_instance *vk_instance,
struct _drmDevice *drm_device,
struct vk_physical_device **out)
{
struct anv_instance *instance =
container_of(vk_instance, struct anv_instance, vk);
if (!(drm_device->available_nodes & (1 << DRM_NODE_RENDER)) ||
drm_device->bustype != DRM_BUS_PCI ||
drm_device->deviceinfo.pci->vendor_id != 0x8086)
return VK_ERROR_INCOMPATIBLE_DRIVER;
const char *primary_path = drm_device->nodes[DRM_NODE_PRIMARY];
const char *path = drm_device->nodes[DRM_NODE_RENDER];
VkResult result;
int fd;
int master_fd = -1;
process_intel_debug_variable();
fd = open(path, O_RDWR | O_CLOEXEC);
if (fd < 0) {
if (errno == ENOMEM) {
return vk_errorf(instance, VK_ERROR_OUT_OF_HOST_MEMORY,
"Unable to open device %s: out of memory", path);
}
return vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
"Unable to open device %s: %m", path);
}
struct intel_device_info devinfo;
if (!intel_get_device_info_from_fd(fd, &devinfo, 9, -1)) {
result = VK_ERROR_INCOMPATIBLE_DRIVER;
goto fail_fd;
}
if (devinfo.ver == 20) {
mesa_logw("Vulkan not yet supported on %s", devinfo.name);
} else if (devinfo.ver > 12) {
result = vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
"Vulkan not yet supported on %s", devinfo.name);
goto fail_fd;
} else if (devinfo.ver < 9) {
/* Silently fail here, hasvk should pick up this device. */
result = VK_ERROR_INCOMPATIBLE_DRIVER;
goto fail_fd;
}
/* Disable Wa_16013994831 on Gfx12.0 because we found other cases where we
    * need to always disable preemption:
* - https://gitlab.freedesktop.org/mesa/mesa/-/issues/5963
* - https://gitlab.freedesktop.org/mesa/mesa/-/issues/5662
*/
if (devinfo.verx10 == 120)
BITSET_CLEAR(devinfo.workarounds, INTEL_WA_16013994831);
if (!devinfo.has_context_isolation) {
result = vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
"Vulkan requires context isolation for %s", devinfo.name);
goto fail_fd;
}
struct anv_physical_device *device =
vk_zalloc(&instance->vk.alloc, sizeof(*device), 8,
VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
if (device == NULL) {
result = vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY);
goto fail_fd;
}
struct vk_physical_device_dispatch_table dispatch_table;
vk_physical_device_dispatch_table_from_entrypoints(
&dispatch_table, &anv_physical_device_entrypoints, true);
vk_physical_device_dispatch_table_from_entrypoints(
&dispatch_table, &wsi_physical_device_entrypoints, false);
result = vk_physical_device_init(&device->vk, &instance->vk,
NULL, NULL, NULL, /* We set up extensions later */
&dispatch_table);
if (result != VK_SUCCESS) {
vk_error(instance, result);
goto fail_alloc;
}
device->instance = instance;
assert(strlen(path) < ARRAY_SIZE(device->path));
snprintf(device->path, ARRAY_SIZE(device->path), "%s", path);
device->info = devinfo;
device->local_fd = fd;
result = anv_physical_device_get_parameters(device);
if (result != VK_SUCCESS)
goto fail_base;
device->gtt_size = device->info.gtt_size ? device->info.gtt_size :
device->info.aperture_bytes;
if (device->gtt_size < (4ULL << 30 /* GiB */)) {
vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
"GTT size too small: 0x%016"PRIx64, device->gtt_size);
goto fail_base;
}
/* We currently only have the right bits for instructions in Gen12+. If the
* kernel ever starts supporting that feature on previous generations,
* we'll need to edit genxml prior to enabling here.
*/
device->has_protected_contexts = device->info.ver >= 12 &&
intel_gem_supports_protected_context(fd, device->info.kmd_type);
/* Just pick one; they're all the same */
device->has_astc_ldr =
isl_format_supports_sampling(&device->info,
ISL_FORMAT_ASTC_LDR_2D_4X4_FLT16);
if (!device->has_astc_ldr &&
driQueryOptionb(&device->instance->dri_options, "vk_require_astc"))
device->emu_astc_ldr = true;
if (devinfo.ver == 9 && !intel_device_info_is_9lp(&devinfo)) {
device->flush_astc_ldr_void_extent_denorms =
device->has_astc_ldr && !device->emu_astc_ldr;
}
device->disable_fcv = device->info.verx10 >= 125 ||
instance->disable_fcv;
result = anv_physical_device_init_heaps(device, fd);
if (result != VK_SUCCESS)
goto fail_base;
if (debug_get_bool_option("ANV_QUEUE_THREAD_DISABLE", false))
device->has_exec_timeline = false;
device->has_cooperative_matrix =
device->info.cooperative_matrix_configurations[0].scope != INTEL_CMAT_SCOPE_NONE;
unsigned st_idx = 0;
device->sync_syncobj_type = vk_drm_syncobj_get_type(fd);
if (!device->has_exec_timeline)
device->sync_syncobj_type.features &= ~VK_SYNC_FEATURE_TIMELINE;
device->sync_types[st_idx++] = &device->sync_syncobj_type;
/* anv_bo_sync_type is only supported with i915 for now */
if (device->info.kmd_type == INTEL_KMD_TYPE_I915) {
if (!(device->sync_syncobj_type.features & VK_SYNC_FEATURE_CPU_WAIT))
device->sync_types[st_idx++] = &anv_bo_sync_type;
if (!(device->sync_syncobj_type.features & VK_SYNC_FEATURE_TIMELINE)) {
device->sync_timeline_type = vk_sync_timeline_get_type(&anv_bo_sync_type);
device->sync_types[st_idx++] = &device->sync_timeline_type.sync;
}
} else {
assert(vk_sync_type_is_drm_syncobj(&device->sync_syncobj_type));
assert(device->sync_syncobj_type.features & VK_SYNC_FEATURE_TIMELINE);
assert(device->sync_syncobj_type.features & VK_SYNC_FEATURE_CPU_WAIT);
}
device->sync_types[st_idx++] = NULL;
assert(st_idx <= ARRAY_SIZE(device->sync_types));
device->vk.supported_sync_types = device->sync_types;
device->vk.pipeline_cache_import_ops = anv_cache_import_ops;
device->always_use_bindless =
debug_get_bool_option("ANV_ALWAYS_BINDLESS", false);
device->use_call_secondary =
!debug_get_bool_option("ANV_DISABLE_SECONDARY_CMD_BUFFER_CALLS", false);
device->video_decode_enabled = debug_get_bool_option("ANV_VIDEO_DECODE", false);
device->uses_ex_bso = device->info.verx10 >= 125;
/* For now always use indirect descriptors. We'll update this
* to !uses_ex_bso when all the infrastructure is built up.
*/
device->indirect_descriptors =
!device->uses_ex_bso ||
driQueryOptionb(&instance->dri_options, "force_indirect_descriptors");
device->alloc_aux_tt_mem =
device->info.has_aux_map && device->info.verx10 >= 125;
/* Check if we can read the GPU timestamp register from the CPU */
uint64_t u64_ignore;
device->has_reg_timestamp = intel_gem_read_render_timestamp(fd,
device->info.kmd_type,
&u64_ignore);
device->uses_relocs = device->info.kmd_type != INTEL_KMD_TYPE_XE;
/* While xe.ko can use both vm_bind and TR-TT, i915.ko only has TR-TT. */
if (device->info.kmd_type == INTEL_KMD_TYPE_XE) {
if (debug_get_bool_option("ANV_SPARSE_USE_TRTT", false))
device->sparse_type = ANV_SPARSE_TYPE_TRTT;
else
device->sparse_type = ANV_SPARSE_TYPE_VM_BIND;
} else {
if (device->info.ver >= 12 &&
device->has_exec_timeline &&
debug_get_bool_option("ANV_SPARSE", true)) {
device->sparse_type = ANV_SPARSE_TYPE_TRTT;
} else if (instance->has_fake_sparse) {
device->sparse_type = ANV_SPARSE_TYPE_FAKE;
} else {
device->sparse_type = ANV_SPARSE_TYPE_NOT_SUPPORTED;
}
}
device->always_flush_cache = INTEL_DEBUG(DEBUG_STALL) ||
driQueryOptionb(&instance->dri_options, "always_flush_cache");
device->compiler = brw_compiler_create(NULL, &device->info);
if (device->compiler == NULL) {
result = vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY);
goto fail_base;
}
device->compiler->shader_debug_log = compiler_debug_log;
device->compiler->shader_perf_log = compiler_perf_log;
device->compiler->indirect_ubos_use_sampler = device->info.ver < 12;
device->compiler->extended_bindless_surface_offset = device->uses_ex_bso;
device->compiler->use_bindless_sampler_offset = false;
device->compiler->spilling_rate =
driQueryOptioni(&instance->dri_options, "shader_spilling_rate");
isl_device_init(&device->isl_dev, &device->info);
device->isl_dev.buffer_length_in_aux_addr = true;
result = anv_physical_device_init_uuids(device);
if (result != VK_SUCCESS)
goto fail_compiler;
anv_physical_device_init_va_ranges(device);
anv_physical_device_init_disk_cache(device);
if (instance->vk.enabled_extensions.KHR_display) {
master_fd = open(primary_path, O_RDWR | O_CLOEXEC);
if (master_fd >= 0) {
/* fail if we don't have permission to even render on this device */
if (!intel_gem_can_render_on_fd(master_fd, device->info.kmd_type)) {
close(master_fd);
master_fd = -1;
}
}
}
device->master_fd = master_fd;
device->engine_info = intel_engine_get_info(fd, device->info.kmd_type);
device->info.has_compute_engine = device->engine_info &&
intel_engines_count(device->engine_info,
INTEL_ENGINE_CLASS_COMPUTE);
anv_physical_device_init_queue_families(device);
anv_physical_device_init_perf(device, fd);
/* Gather major/minor before WSI. */
struct stat st;
if (stat(primary_path, &st) == 0) {
device->has_master = true;
device->master_major = major(st.st_rdev);
device->master_minor = minor(st.st_rdev);
} else {
device->has_master = false;
device->master_major = 0;
device->master_minor = 0;
}
if (stat(path, &st) == 0) {
device->has_local = true;
device->local_major = major(st.st_rdev);
device->local_minor = minor(st.st_rdev);
} else {
device->has_local = false;
device->local_major = 0;
device->local_minor = 0;
}
get_device_extensions(device, &device->vk.supported_extensions);
get_features(device, &device->vk.supported_features);
get_properties(device, &device->vk.properties);
result = anv_init_wsi(device);
if (result != VK_SUCCESS)
goto fail_perf;
anv_measure_device_init(device);
anv_genX(&device->info, init_physical_device_state)(device);
*out = &device->vk;
return VK_SUCCESS;
fail_perf:
intel_perf_free(device->perf);
free(device->engine_info);
anv_physical_device_free_disk_cache(device);
fail_compiler:
ralloc_free(device->compiler);
fail_base:
vk_physical_device_finish(&device->vk);
fail_alloc:
vk_free(&instance->vk.alloc, device);
fail_fd:
close(fd);
if (master_fd != -1)
close(master_fd);
return result;
}
static void
anv_physical_device_destroy(struct vk_physical_device *vk_device)
{
struct anv_physical_device *device =
container_of(vk_device, struct anv_physical_device, vk);
anv_finish_wsi(device);
anv_measure_device_destroy(device);
free(device->engine_info);
anv_physical_device_free_disk_cache(device);
ralloc_free(device->compiler);
intel_perf_free(device->perf);
close(device->local_fd);
if (device->master_fd >= 0)
close(device->master_fd);
vk_physical_device_finish(&device->vk);
vk_free(&device->instance->vk.alloc, device);
}
VkResult anv_EnumerateInstanceExtensionProperties(
const char* pLayerName,
uint32_t* pPropertyCount,
VkExtensionProperties* pProperties)
{
if (pLayerName)
return vk_error(NULL, VK_ERROR_LAYER_NOT_PRESENT);
return vk_enumerate_instance_extension_properties(
&instance_extensions, pPropertyCount, pProperties);
}
static void
anv_init_dri_options(struct anv_instance *instance)
{
driParseOptionInfo(&instance->available_dri_options, anv_dri_options,
ARRAY_SIZE(anv_dri_options));
driParseConfigFiles(&instance->dri_options,
&instance->available_dri_options, 0, "anv", NULL, NULL,
instance->vk.app_info.app_name,
instance->vk.app_info.app_version,
instance->vk.app_info.engine_name,
instance->vk.app_info.engine_version);
instance->assume_full_subgroups =
driQueryOptioni(&instance->dri_options, "anv_assume_full_subgroups");
instance->limit_trig_input_range =
driQueryOptionb(&instance->dri_options, "limit_trig_input_range");
instance->sample_mask_out_opengl_behaviour =
driQueryOptionb(&instance->dri_options, "anv_sample_mask_out_opengl_behaviour");
instance->force_filter_addr_rounding =
driQueryOptionb(&instance->dri_options, "anv_force_filter_addr_rounding");
instance->lower_depth_range_rate =
driQueryOptionf(&instance->dri_options, "lower_depth_range_rate");
instance->no_16bit =
driQueryOptionb(&instance->dri_options, "no_16bit");
instance->intel_enable_wa_14018912822 =
driQueryOptionb(&instance->dri_options, "intel_enable_wa_14018912822");
instance->mesh_conv_prim_attrs_to_vert_attrs =
driQueryOptioni(&instance->dri_options, "anv_mesh_conv_prim_attrs_to_vert_attrs");
instance->fp64_workaround_enabled =
driQueryOptionb(&instance->dri_options, "fp64_workaround_enabled");
instance->generated_indirect_threshold =
driQueryOptioni(&instance->dri_options, "generated_indirect_threshold");
instance->generated_indirect_ring_threshold =
driQueryOptioni(&instance->dri_options, "generated_indirect_ring_threshold");
instance->query_clear_with_blorp_threshold =
driQueryOptioni(&instance->dri_options, "query_clear_with_blorp_threshold");
instance->query_copy_with_shader_threshold =
driQueryOptioni(&instance->dri_options, "query_copy_with_shader_threshold");
instance->force_vk_vendor =
driQueryOptioni(&instance->dri_options, "force_vk_vendor");
instance->has_fake_sparse =
driQueryOptionb(&instance->dri_options, "fake_sparse");
instance->enable_tbimr = driQueryOptionb(&instance->dri_options, "intel_tbimr");
instance->disable_fcv =
driQueryOptionb(&instance->dri_options, "anv_disable_fcv");
instance->external_memory_implicit_sync =
driQueryOptionb(&instance->dri_options, "anv_external_memory_implicit_sync");
instance->compression_control_enabled =
driQueryOptionb(&instance->dri_options, "compression_control_enabled");
}
VkResult anv_CreateInstance(
const VkInstanceCreateInfo* pCreateInfo,
const VkAllocationCallbacks* pAllocator,
VkInstance* pInstance)
{
struct anv_instance *instance;
VkResult result;
assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO);
if (pAllocator == NULL)
pAllocator = vk_default_allocator();
instance = vk_alloc(pAllocator, sizeof(*instance), 8,
VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
if (!instance)
return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY);
struct vk_instance_dispatch_table dispatch_table;
vk_instance_dispatch_table_from_entrypoints(
&dispatch_table, &anv_instance_entrypoints, true);
vk_instance_dispatch_table_from_entrypoints(
&dispatch_table, &wsi_instance_entrypoints, false);
result = vk_instance_init(&instance->vk, &instance_extensions,
&dispatch_table, pCreateInfo, pAllocator);
if (result != VK_SUCCESS) {
vk_free(pAllocator, instance);
return vk_error(NULL, result);
}
instance->vk.physical_devices.try_create_for_drm = anv_physical_device_try_create;
instance->vk.physical_devices.destroy = anv_physical_device_destroy;
VG(VALGRIND_CREATE_MEMPOOL(instance, 0, false));
anv_init_dri_options(instance);
intel_driver_ds_init();
*pInstance = anv_instance_to_handle(instance);
return VK_SUCCESS;
}
void anv_DestroyInstance(
VkInstance _instance,
const VkAllocationCallbacks* pAllocator)
{
ANV_FROM_HANDLE(anv_instance, instance, _instance);
if (!instance)
return;
VG(VALGRIND_DESTROY_MEMPOOL(instance));
driDestroyOptionCache(&instance->dri_options);
driDestroyOptionInfo(&instance->available_dri_options);
vk_instance_finish(&instance->vk);
vk_free(&instance->vk.alloc, instance);
}
static const VkQueueFamilyProperties
get_anv_queue_family_properties_template(const struct anv_physical_device *device)
{
   /*
    * For Xe2+:
    * Bspec 60411: Timestamp register can hold a 64-bit value
    *
    * Platforms < Xe2:
    * Bspec 46111: Timestamp register can hold only a 36-bit value
    */
const VkQueueFamilyProperties anv_queue_family_properties_template =
{
.timestampValidBits = device->info.ver >= 20 ? 64 : 36,
.minImageTransferGranularity = { 1, 1, 1 },
};
return anv_queue_family_properties_template;
}
static VkQueueFamilyProperties
anv_device_physical_get_queue_properties(const struct anv_physical_device *device,
uint32_t family_index)
{
const struct anv_queue_family *family = &device->queue.families[family_index];
VkQueueFamilyProperties properties =
get_anv_queue_family_properties_template(device);
properties.queueFlags = family->queueFlags;
properties.queueCount = family->queueCount;
return properties;
}
void anv_GetPhysicalDeviceQueueFamilyProperties2(
VkPhysicalDevice physicalDevice,
uint32_t* pQueueFamilyPropertyCount,
VkQueueFamilyProperties2* pQueueFamilyProperties)
{
ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
VK_OUTARRAY_MAKE_TYPED(VkQueueFamilyProperties2, out,
pQueueFamilyProperties, pQueueFamilyPropertyCount);
for (uint32_t i = 0; i < pdevice->queue.family_count; i++) {
struct anv_queue_family *queue_family = &pdevice->queue.families[i];
vk_outarray_append_typed(VkQueueFamilyProperties2, &out, p) {
p->queueFamilyProperties =
anv_device_physical_get_queue_properties(pdevice, i);
vk_foreach_struct(ext, p->pNext) {
switch (ext->sType) {
case VK_STRUCTURE_TYPE_QUEUE_FAMILY_GLOBAL_PRIORITY_PROPERTIES_KHR: {
VkQueueFamilyGlobalPriorityPropertiesKHR *properties =
(VkQueueFamilyGlobalPriorityPropertiesKHR *)ext;
/* Deliberately sorted low to high */
VkQueueGlobalPriorityKHR all_priorities[] = {
VK_QUEUE_GLOBAL_PRIORITY_LOW_KHR,
VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR,
VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR,
VK_QUEUE_GLOBAL_PRIORITY_REALTIME_KHR,
};
uint32_t count = 0;
for (unsigned i = 0; i < ARRAY_SIZE(all_priorities); i++) {
if (all_priorities[i] > pdevice->max_context_priority)
break;
properties->priorities[count++] = all_priorities[i];
}
properties->priorityCount = count;
break;
}
case VK_STRUCTURE_TYPE_QUEUE_FAMILY_QUERY_RESULT_STATUS_PROPERTIES_KHR: {
VkQueueFamilyQueryResultStatusPropertiesKHR *prop =
(VkQueueFamilyQueryResultStatusPropertiesKHR *)ext;
prop->queryResultStatusSupport = VK_TRUE;
break;
}
case VK_STRUCTURE_TYPE_QUEUE_FAMILY_VIDEO_PROPERTIES_KHR: {
VkQueueFamilyVideoPropertiesKHR *prop =
(VkQueueFamilyVideoPropertiesKHR *)ext;
if (queue_family->queueFlags & VK_QUEUE_VIDEO_DECODE_BIT_KHR) {
prop->videoCodecOperations = VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR |
VK_VIDEO_CODEC_OPERATION_DECODE_H265_BIT_KHR;
}
break;
}
default:
vk_debug_ignored_stype(ext->sType);
}
}
}
}
}
void anv_GetPhysicalDeviceMemoryProperties(
VkPhysicalDevice physicalDevice,
VkPhysicalDeviceMemoryProperties* pMemoryProperties)
{
ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
pMemoryProperties->memoryTypeCount = physical_device->memory.type_count;
for (uint32_t i = 0; i < physical_device->memory.type_count; i++) {
pMemoryProperties->memoryTypes[i] = (VkMemoryType) {
.propertyFlags = physical_device->memory.types[i].propertyFlags,
.heapIndex = physical_device->memory.types[i].heapIndex,
};
}
pMemoryProperties->memoryHeapCount = physical_device->memory.heap_count;
for (uint32_t i = 0; i < physical_device->memory.heap_count; i++) {
pMemoryProperties->memoryHeaps[i] = (VkMemoryHeap) {
.size = physical_device->memory.heaps[i].size,
.flags = physical_device->memory.heaps[i].flags,
};
}
}
static void
anv_get_memory_budget(VkPhysicalDevice physicalDevice,
VkPhysicalDeviceMemoryBudgetPropertiesEXT *memoryBudget)
{
ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
if (!device->vk.supported_extensions.EXT_memory_budget)
return;
anv_update_meminfo(device, device->local_fd);
VkDeviceSize total_sys_heaps_size = 0, total_vram_heaps_size = 0;
for (size_t i = 0; i < device->memory.heap_count; i++) {
if (device->memory.heaps[i].is_local_mem) {
total_vram_heaps_size += device->memory.heaps[i].size;
} else {
total_sys_heaps_size += device->memory.heaps[i].size;
}
}
for (size_t i = 0; i < device->memory.heap_count; i++) {
VkDeviceSize heap_size = device->memory.heaps[i].size;
VkDeviceSize heap_used = device->memory.heaps[i].used;
VkDeviceSize heap_budget, total_heaps_size;
uint64_t mem_available = 0;
if (device->memory.heaps[i].is_local_mem) {
total_heaps_size = total_vram_heaps_size;
if (device->vram_non_mappable.size > 0 && i == 0) {
mem_available = device->vram_non_mappable.available;
} else {
mem_available = device->vram_mappable.available;
}
} else {
total_heaps_size = total_sys_heaps_size;
mem_available = MIN2(device->sys.available, total_heaps_size);
}
double heap_proportion = (double) heap_size / total_heaps_size;
VkDeviceSize available_prop = mem_available * heap_proportion;
/*
* Let's not incite the app to starve the system: report at most 90% of
* the available heap memory.
*/
uint64_t heap_available = available_prop * 9 / 10;
heap_budget = MIN2(heap_size, heap_used + heap_available);
/*
* Round down to the nearest MB
*/
heap_budget &= ~((1ull << 20) - 1);
/*
* The heapBudget value must be non-zero for array elements less than
* VkPhysicalDeviceMemoryProperties::memoryHeapCount. The heapBudget
* value must be less than or equal to VkMemoryHeap::size for each heap.
*/
assert(0 < heap_budget && heap_budget <= heap_size);
memoryBudget->heapUsage[i] = heap_used;
memoryBudget->heapBudget[i] = heap_budget;
}
/* The heapBudget and heapUsage values must be zero for array elements
* greater than or equal to VkPhysicalDeviceMemoryProperties::memoryHeapCount
*/
for (uint32_t i = device->memory.heap_count; i < VK_MAX_MEMORY_HEAPS; i++) {
memoryBudget->heapBudget[i] = 0;
memoryBudget->heapUsage[i] = 0;
}
}
void anv_GetPhysicalDeviceMemoryProperties2(
VkPhysicalDevice physicalDevice,
VkPhysicalDeviceMemoryProperties2* pMemoryProperties)
{
anv_GetPhysicalDeviceMemoryProperties(physicalDevice,
&pMemoryProperties->memoryProperties);
vk_foreach_struct(ext, pMemoryProperties->pNext) {
switch (ext->sType) {
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT:
anv_get_memory_budget(physicalDevice, (void*)ext);
break;
default:
vk_debug_ignored_stype(ext->sType);
break;
}
}
}
PFN_vkVoidFunction anv_GetInstanceProcAddr(
VkInstance _instance,
const char* pName)
{
ANV_FROM_HANDLE(anv_instance, instance, _instance);
return vk_instance_get_proc_addr(&instance->vk,
&anv_instance_entrypoints,
pName);
}
/* With version 1+ of the loader interface the ICD should expose
* vk_icdGetInstanceProcAddr to work around certain LD_PRELOAD issues seen in apps.
*/
PUBLIC
VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vk_icdGetInstanceProcAddr(
VkInstance instance,
const char* pName)
{
return anv_GetInstanceProcAddr(instance, pName);
}
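/* Emit the standard Vulkan border color values into the dynamic state
 * pool(s) so samplers can refer to them by offset.
 */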
static void
anv_device_init_border_colors(struct anv_device *device)
{
static const struct gfx8_border_color border_colors[] = {
[VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK] = { .float32 = { 0.0, 0.0, 0.0, 0.0 } },
[VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK] = { .float32 = { 0.0, 0.0, 0.0, 1.0 } },
[VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE] = { .float32 = { 1.0, 1.0, 1.0, 1.0 } },
[VK_BORDER_COLOR_INT_TRANSPARENT_BLACK] = { .uint32 = { 0, 0, 0, 0 } },
[VK_BORDER_COLOR_INT_OPAQUE_BLACK] = { .uint32 = { 0, 0, 0, 1 } },
[VK_BORDER_COLOR_INT_OPAQUE_WHITE] = { .uint32 = { 1, 1, 1, 1 } },
};
device->border_colors =
anv_state_pool_emit_data(&device->dynamic_state_pool,
sizeof(border_colors), 64, border_colors);
if (device->vk.enabled_extensions.EXT_descriptor_buffer) {
device->border_colors_db =
anv_state_pool_emit_data(&device->dynamic_state_db_pool,
sizeof(border_colors), 64, border_colors);
}
}
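/* Allocate a small batch buffer containing only MI_BATCH_BUFFER_END (padded
 * with MI_NOOP), for submissions that carry no commands of their own.
 */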
static VkResult
anv_device_init_trivial_batch(struct anv_device *device)
{
VkResult result = anv_device_alloc_bo(device, "trivial-batch", 4096,
ANV_BO_ALLOC_MAPPED |
ANV_BO_ALLOC_HOST_COHERENT |
ANV_BO_ALLOC_INTERNAL,
0 /* explicit_address */,
&device->trivial_batch_bo);
if (result != VK_SUCCESS)
return result;
struct anv_batch batch = {
.start = device->trivial_batch_bo->map,
.next = device->trivial_batch_bo->map,
.end = device->trivial_batch_bo->map + 4096,
};
anv_batch_emit(&batch, GFX7_MI_BATCH_BUFFER_END, bbe);
anv_batch_emit(&batch, GFX7_MI_NOOP, noop);
return VK_SUCCESS;
}
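/* Return true and fill *ret if the given GPU address falls inside one of the
 * BOs backing the block pool.
 */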
static bool
get_bo_from_pool(struct intel_batch_decode_bo *ret,
struct anv_block_pool *pool,
uint64_t address)
{
anv_block_pool_foreach_bo(bo, pool) {
uint64_t bo_address = intel_48b_address(bo->offset);
if (address >= bo_address && address < (bo_address + bo->size)) {
*ret = (struct intel_batch_decode_bo) {
.addr = bo_address,
.size = bo->size,
.map = bo->map,
};
return true;
}
}
return false;
}
/* Batch decoder callback: find the BO that backs a given GPU address. */
static struct intel_batch_decode_bo
decode_get_bo(void *v_batch, bool ppgtt, uint64_t address)
{
struct anv_device *device = v_batch;
struct intel_batch_decode_bo ret_bo = {};
assert(ppgtt);
if (get_bo_from_pool(&ret_bo, &device->dynamic_state_pool.block_pool, address))
return ret_bo;
if (device->vk.enabled_extensions.EXT_descriptor_buffer &&
get_bo_from_pool(&ret_bo, &device->dynamic_state_db_pool.block_pool, address))
return ret_bo;
if (get_bo_from_pool(&ret_bo, &device->instruction_state_pool.block_pool, address))
return ret_bo;
if (get_bo_from_pool(&ret_bo, &device->binding_table_pool.block_pool, address))
return ret_bo;
if (get_bo_from_pool(&ret_bo, &device->scratch_surface_state_pool.block_pool, address))
return ret_bo;
if (device->physical->indirect_descriptors &&
get_bo_from_pool(&ret_bo, &device->bindless_surface_state_pool.block_pool, address))
return ret_bo;
if (get_bo_from_pool(&ret_bo, &device->internal_surface_state_pool.block_pool, address))
return ret_bo;
if (device->physical->indirect_descriptors &&
get_bo_from_pool(&ret_bo, &device->indirect_push_descriptor_pool.block_pool, address))
return ret_bo;
if (device->info->has_aux_map &&
get_bo_from_pool(&ret_bo, &device->aux_tt_pool.block_pool, address))
return ret_bo;
if (!device->cmd_buffer_being_decoded)
return (struct intel_batch_decode_bo) { };
struct anv_batch_bo **bbo;
u_vector_foreach(bbo, &device->cmd_buffer_being_decoded->seen_bbos) {
/* The decoder zeroes out the top 16 bits, so we need to as well */
uint64_t bo_address = (*bbo)->bo->offset & (~0ull >> 16);
if (address >= bo_address && address < bo_address + (*bbo)->bo->size) {
return (struct intel_batch_decode_bo) {
.addr = bo_address,
.size = (*bbo)->bo->size,
.map = (*bbo)->bo->map,
};
}
uint32_t dep_words = (*bbo)->relocs.dep_words;
BITSET_WORD *deps = (*bbo)->relocs.deps;
for (uint32_t w = 0; w < dep_words; w++) {
BITSET_WORD mask = deps[w];
while (mask) {
int i = u_bit_scan(&mask);
uint32_t gem_handle = w * BITSET_WORDBITS + i;
struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle);
assert(bo->refcount > 0);
bo_address = bo->offset & (~0ull >> 16);
if (address >= bo_address && address < bo_address + bo->size) {
return (struct intel_batch_decode_bo) {
.addr = bo_address,
.size = bo->size,
.map = bo->map,
};
}
}
}
}
return (struct intel_batch_decode_bo) { };
}
struct intel_aux_map_buffer {
struct intel_buffer base;
struct anv_state state;
};
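/* Allocation callbacks for the shared intel_aux_map code: aux translation
 * table pages are carved out of the device's aux_tt_pool state pool.
 */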
static struct intel_buffer *
intel_aux_map_buffer_alloc(void *driver_ctx, uint32_t size)
{
struct intel_aux_map_buffer *buf = malloc(sizeof(struct intel_aux_map_buffer));
if (!buf)
return NULL;
struct anv_device *device = (struct anv_device*)driver_ctx;
struct anv_state_pool *pool = &device->aux_tt_pool;
buf->state = anv_state_pool_alloc(pool, size, size);
buf->base.gpu = pool->block_pool.bo->offset + buf->state.offset;
buf->base.gpu_end = buf->base.gpu + buf->state.alloc_size;
buf->base.map = buf->state.map;
buf->base.driver_bo = &buf->state;
return &buf->base;
}
static void
intel_aux_map_buffer_free(void *driver_ctx, struct intel_buffer *buffer)
{
struct intel_aux_map_buffer *buf = (struct intel_aux_map_buffer*)buffer;
struct anv_device *device = (struct anv_device*)driver_ctx;
struct anv_state_pool *pool = &device->aux_tt_pool;
anv_state_pool_free(pool, buf->state);
free(buf);
}
static struct intel_mapped_pinned_buffer_alloc aux_map_allocator = {
.alloc = intel_aux_map_buffer_alloc,
.free = intel_aux_map_buffer_free,
};
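/* On i915 we create a GEM context for the device, on Xe we create a VM. */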
static VkResult
anv_device_setup_context_or_vm(struct anv_device *device,
const VkDeviceCreateInfo *pCreateInfo,
const uint32_t num_queues)
{
switch (device->info->kmd_type) {
case INTEL_KMD_TYPE_I915:
return anv_i915_device_setup_context(device, pCreateInfo, num_queues);
case INTEL_KMD_TYPE_XE:
return anv_xe_device_setup_vm(device);
default:
unreachable("Missing");
return VK_ERROR_UNKNOWN;
}
}
static bool
anv_device_destroy_context_or_vm(struct anv_device *device)
{
switch (device->info->kmd_type) {
case INTEL_KMD_TYPE_I915:
if (device->physical->has_vm_control)
return anv_i915_device_destroy_vm(device);
else
return intel_gem_destroy_context(device->fd, device->context_id);
case INTEL_KMD_TYPE_XE:
return anv_xe_device_destroy_vm(device);
default:
unreachable("Missing");
return false;
}
}
static VkResult
anv_device_init_trtt(struct anv_device *device)
{
struct anv_trtt *trtt = &device->trtt;
if (pthread_mutex_init(&trtt->mutex, NULL) != 0)
return vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
list_inithead(&trtt->in_flight_batches);
return VK_SUCCESS;
}
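/* Wait for any in-flight TR-TT batches, then release the timeline syncobj,
 * the L2/L3 mirrors and the page table BOs.
 */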
static void
anv_device_finish_trtt(struct anv_device *device)
{
struct anv_trtt *trtt = &device->trtt;
if (trtt->timeline_val > 0) {
struct drm_syncobj_timeline_wait wait = {
.handles = (uintptr_t)&trtt->timeline_handle,
.points = (uintptr_t)&trtt->timeline_val,
.timeout_nsec = INT64_MAX,
.count_handles = 1,
.flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL,
.first_signaled = false,
};
if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT, &wait))
fprintf(stderr, "TR-TT syncobj wait failed!\n");
list_for_each_entry_safe(struct anv_trtt_batch_bo, trtt_bbo,
&trtt->in_flight_batches, link)
anv_trtt_batch_bo_free(device, trtt_bbo);
}
if (trtt->timeline_handle > 0) {
struct drm_syncobj_destroy destroy = {
.handle = trtt->timeline_handle,
};
if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_DESTROY, &destroy))
fprintf(stderr, "TR-TT syncobj destroy failed!\n");
}
pthread_mutex_destroy(&trtt->mutex);
vk_free(&device->vk.alloc, trtt->l3_mirror);
vk_free(&device->vk.alloc, trtt->l2_mirror);
for (int i = 0; i < trtt->num_page_table_bos; i++)
anv_device_release_bo(device, trtt->page_table_bos[i]);
vk_free(&device->vk.alloc, trtt->page_table_bos);
}
VkResult anv_CreateDevice(
VkPhysicalDevice physicalDevice,
const VkDeviceCreateInfo* pCreateInfo,
const VkAllocationCallbacks* pAllocator,
VkDevice* pDevice)
{
ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
VkResult result;
struct anv_device *device;
assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO);
/* Check requested queues and fail if we are requested to create any
* queues with flags we don't support.
*/
assert(pCreateInfo->queueCreateInfoCount > 0);
for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
if (pCreateInfo->pQueueCreateInfos[i].flags & ~VK_DEVICE_QUEUE_CREATE_PROTECTED_BIT)
return vk_error(physical_device, VK_ERROR_INITIALIZATION_FAILED);
}
device = vk_zalloc2(&physical_device->instance->vk.alloc, pAllocator,
sizeof(*device), 8,
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
if (!device)
return vk_error(physical_device, VK_ERROR_OUT_OF_HOST_MEMORY);
struct vk_device_dispatch_table dispatch_table;
bool override_initial_entrypoints = true;
if (physical_device->instance->vk.app_info.app_name &&
!strcmp(physical_device->instance->vk.app_info.app_name, "HITMAN3.exe")) {
vk_device_dispatch_table_from_entrypoints(&dispatch_table,
&anv_hitman3_device_entrypoints,
true);
override_initial_entrypoints = false;
}
if (physical_device->info.ver < 12 &&
physical_device->instance->vk.app_info.app_name &&
!strcmp(physical_device->instance->vk.app_info.app_name, "DOOM 64")) {
vk_device_dispatch_table_from_entrypoints(&dispatch_table,
&anv_doom64_device_entrypoints,
true);
override_initial_entrypoints = false;
}
#if DETECT_OS_ANDROID
vk_device_dispatch_table_from_entrypoints(&dispatch_table,
&anv_android_device_entrypoints,
true);
override_initial_entrypoints = false;
#endif
if (physical_device->instance->vk.trace_mode & VK_TRACE_MODE_RMV) {
vk_device_dispatch_table_from_entrypoints(&dispatch_table,
&anv_rmv_device_entrypoints,
true);
override_initial_entrypoints = false;
}
vk_device_dispatch_table_from_entrypoints(&dispatch_table,
anv_genX(&physical_device->info, device_entrypoints),
override_initial_entrypoints);
vk_device_dispatch_table_from_entrypoints(&dispatch_table,
&anv_device_entrypoints, false);
vk_device_dispatch_table_from_entrypoints(&dispatch_table,
&wsi_device_entrypoints, false);
result = vk_device_init(&device->vk, &physical_device->vk,
&dispatch_table, pCreateInfo, pAllocator);
if (result != VK_SUCCESS)
goto fail_alloc;
if (INTEL_DEBUG(DEBUG_BATCH | DEBUG_BATCH_STATS)) {
for (unsigned i = 0; i < physical_device->queue.family_count; i++) {
struct intel_batch_decode_ctx *decoder = &device->decoder[i];
const unsigned decode_flags = INTEL_BATCH_DECODE_DEFAULT_FLAGS;
intel_batch_decode_ctx_init_brw(decoder,
&physical_device->compiler->isa,
&physical_device->info,
stderr, decode_flags, NULL,
decode_get_bo, NULL, device);
intel_batch_stats_reset(decoder);
decoder->engine = physical_device->queue.families[i].engine_class;
decoder->dynamic_base = physical_device->va.dynamic_state_pool.addr;
decoder->surface_base = physical_device->va.internal_surface_state_pool.addr;
decoder->instruction_base = physical_device->va.instruction_state_pool.addr;
}
}
anv_device_set_physical(device, physical_device);
device->kmd_backend = anv_kmd_backend_get(device->info->kmd_type);
/* XXX(chadv): Can we dup() physicalDevice->fd here? */
device->fd = open(physical_device->path, O_RDWR | O_CLOEXEC);
if (device->fd == -1) {
result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
goto fail_device;
}
switch (device->info->kmd_type) {
case INTEL_KMD_TYPE_I915:
device->vk.check_status = anv_i915_device_check_status;
break;
case INTEL_KMD_TYPE_XE:
device->vk.check_status = anv_xe_device_check_status;
break;
default:
unreachable("Missing");
}
device->vk.command_buffer_ops = &anv_cmd_buffer_ops;
   device->vk.create_sync_for_memory = anv_create_sync_for_memory;
vk_device_set_drm_fd(&device->vk, device->fd);
uint32_t num_queues = 0;
for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++)
num_queues += pCreateInfo->pQueueCreateInfos[i].queueCount;
result = anv_device_setup_context_or_vm(device, pCreateInfo, num_queues);
if (result != VK_SUCCESS)
goto fail_fd;
device->queues =
vk_zalloc(&device->vk.alloc, num_queues * sizeof(*device->queues), 8,
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
if (device->queues == NULL) {
result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
goto fail_context_id;
}
device->queue_count = 0;
for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
const VkDeviceQueueCreateInfo *queueCreateInfo =
&pCreateInfo->pQueueCreateInfos[i];
for (uint32_t j = 0; j < queueCreateInfo->queueCount; j++) {
result = anv_queue_init(device, &device->queues[device->queue_count],
queueCreateInfo, j);
if (result != VK_SUCCESS)
goto fail_queues;
device->queue_count++;
}
}
if (pthread_mutex_init(&device->vma_mutex, NULL) != 0) {
result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
goto fail_queues;
}
/* keep the page with address zero out of the allocator */
util_vma_heap_init(&device->vma_lo,
device->physical->va.low_heap.addr,
device->physical->va.low_heap.size);
util_vma_heap_init(&device->vma_hi,
device->physical->va.high_heap.addr,
device->physical->va.high_heap.size);
if (device->physical->indirect_descriptors) {
util_vma_heap_init(&device->vma_desc,
device->physical->va.indirect_descriptor_pool.addr,
device->physical->va.indirect_descriptor_pool.size);
} else {
util_vma_heap_init(&device->vma_desc,
device->physical->va.bindless_surface_state_pool.addr,
device->physical->va.bindless_surface_state_pool.size);
}
/* Always initialized because the memory types point to this and they are
 * on the physical device.
 */
util_vma_heap_init(&device->vma_desc_buf,
device->physical->va.descriptor_buffer_pool.addr,
device->physical->va.descriptor_buffer_pool.size);
util_vma_heap_init(&device->vma_samplers,
device->physical->va.sampler_state_pool.addr,
device->physical->va.sampler_state_pool.size);
util_vma_heap_init(&device->vma_trtt,
device->physical->va.trtt.addr,
device->physical->va.trtt.size);
list_inithead(&device->memory_objects);
list_inithead(&device->image_private_objects);
if (pthread_mutex_init(&device->mutex, NULL) != 0) {
result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
goto fail_vmas;
}
pthread_condattr_t condattr;
if (pthread_condattr_init(&condattr) != 0) {
result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
goto fail_mutex;
}
if (pthread_condattr_setclock(&condattr, CLOCK_MONOTONIC) != 0) {
pthread_condattr_destroy(&condattr);
result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
goto fail_mutex;
}
if (pthread_cond_init(&device->queue_submit, &condattr) != 0) {
pthread_condattr_destroy(&condattr);
result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
goto fail_mutex;
}
pthread_condattr_destroy(&condattr);
if (physical_device->instance->vk.trace_mode & VK_TRACE_MODE_RMV)
anv_memory_trace_init(device);
result = anv_bo_cache_init(&device->bo_cache, device);
if (result != VK_SUCCESS)
goto fail_queue_cond;
anv_bo_pool_init(&device->batch_bo_pool, device, "batch",
ANV_BO_ALLOC_MAPPED |
ANV_BO_ALLOC_HOST_CACHED_COHERENT |
ANV_BO_ALLOC_CAPTURE);
if (device->vk.enabled_extensions.KHR_acceleration_structure) {
anv_bo_pool_init(&device->bvh_bo_pool, device, "bvh build",
0 /* alloc_flags */);
}
/* Because scratch is also relative to General State Base Address, we leave
* the base address 0 and start the pool memory at an offset. This way we
* get the correct offsets in the anv_states that get allocated from it.
*/
result = anv_state_pool_init(&device->general_state_pool, device,
&(struct anv_state_pool_params) {
.name = "general pool",
.base_address = 0,
.start_offset = device->physical->va.general_state_pool.addr,
.block_size = 16384,
.max_size = device->physical->va.general_state_pool.size
});
if (result != VK_SUCCESS)
goto fail_batch_bo_pool;
result = anv_state_pool_init(&device->dynamic_state_pool, device,
&(struct anv_state_pool_params) {
.name = "dynamic pool",
.base_address = device->physical->va.dynamic_state_pool.addr,
.block_size = 16384,
.max_size = device->physical->va.dynamic_state_pool.size,
});
if (result != VK_SUCCESS)
goto fail_general_state_pool;
if (device->vk.enabled_extensions.EXT_descriptor_buffer) {
result = anv_state_pool_init(&device->dynamic_state_db_pool, device,
&(struct anv_state_pool_params) {
.name = "dynamic pool (db)",
.base_address = device->physical->va.dynamic_state_db_pool.addr,
.block_size = 16384,
.max_size = device->physical->va.dynamic_state_db_pool.size,
});
if (result != VK_SUCCESS)
goto fail_dynamic_state_pool;
}
/* The border color pointer is limited to 24 bits, so we need to make
* sure that any such color used at any point in the program doesn't
* exceed that limit.
* We achieve that by reserving all the custom border colors we support
* right off the bat, so they are close to the base address.
*/
anv_state_reserved_pool_init(&device->custom_border_colors,
&device->dynamic_state_pool,
MAX_CUSTOM_BORDER_COLORS,
sizeof(struct gfx8_border_color), 64);
if (device->vk.enabled_extensions.EXT_descriptor_buffer) {
result = anv_state_reserved_array_pool_init(&device->custom_border_colors_db,
&device->dynamic_state_db_pool,
MAX_CUSTOM_BORDER_COLORS,
sizeof(struct gfx8_border_color), 64);
if (result != VK_SUCCESS)
goto fail_dynamic_state_db_pool;
}
result = anv_state_pool_init(&device->instruction_state_pool, device,
&(struct anv_state_pool_params) {
.name = "instruction pool",
.base_address = device->physical->va.instruction_state_pool.addr,
.block_size = 16384,
.max_size = device->physical->va.instruction_state_pool.size,
});
if (result != VK_SUCCESS)
goto fail_reserved_array_pool;
if (device->info->verx10 >= 125) {
/* Put the scratch surface states at the beginning of the internal
* surface state pool.
*/
result = anv_state_pool_init(&device->scratch_surface_state_pool, device,
&(struct anv_state_pool_params) {
.name = "scratch surface state pool",
.base_address = device->physical->va.scratch_surface_state_pool.addr,
.block_size = 4096,
.max_size = device->physical->va.scratch_surface_state_pool.size,
});
if (result != VK_SUCCESS)
goto fail_instruction_state_pool;
result = anv_state_pool_init(&device->internal_surface_state_pool, device,
&(struct anv_state_pool_params) {
.name = "internal surface state pool",
.base_address = device->physical->va.internal_surface_state_pool.addr,
.start_offset = device->physical->va.scratch_surface_state_pool.size,
.block_size = 4096,
.max_size = device->physical->va.internal_surface_state_pool.size,
});
} else {
result = anv_state_pool_init(&device->internal_surface_state_pool, device,
&(struct anv_state_pool_params) {
.name = "internal surface state pool",
.base_address = device->physical->va.internal_surface_state_pool.addr,
.block_size = 4096,
.max_size = device->physical->va.internal_surface_state_pool.size,
});
}
if (result != VK_SUCCESS)
goto fail_scratch_surface_state_pool;
if (device->physical->indirect_descriptors) {
result = anv_state_pool_init(&device->bindless_surface_state_pool, device,
&(struct anv_state_pool_params) {
.name = "bindless surface state pool",
.base_address = device->physical->va.bindless_surface_state_pool.addr,
.block_size = 4096,
.max_size = device->physical->va.bindless_surface_state_pool.size,
});
if (result != VK_SUCCESS)
goto fail_internal_surface_state_pool;
}
if (device->info->verx10 >= 125) {
/* We're using 3DSTATE_BINDING_TABLE_POOL_ALLOC to give the binding
* table its own base address separately from surface state base.
*/
result = anv_state_pool_init(&device->binding_table_pool, device,
&(struct anv_state_pool_params) {
.name = "binding table pool",
.base_address = device->physical->va.binding_table_pool.addr,
.block_size = BINDING_TABLE_POOL_BLOCK_SIZE,
.max_size = device->physical->va.binding_table_pool.size,
});
} else {
/* The binding table should be in front of the surface states in virtual
 * address space so that all surface states can be expressed as relative
 * offsets from the binding table location.
 */
assert(device->physical->va.binding_table_pool.addr <
device->physical->va.internal_surface_state_pool.addr);
int64_t bt_pool_offset = (int64_t)device->physical->va.binding_table_pool.addr -
(int64_t)device->physical->va.internal_surface_state_pool.addr;
assert(INT32_MIN < bt_pool_offset && bt_pool_offset < 0);
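/* States allocated from this pool end up with negative offsets
 * (start_offset = bt_pool_offset < 0) relative to the internal surface
 * state pool base, i.e. the binding tables sit just below the surface
 * states in the address space.
 */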
result = anv_state_pool_init(&device->binding_table_pool, device,
&(struct anv_state_pool_params) {
.name = "binding table pool",
.base_address = device->physical->va.internal_surface_state_pool.addr,
.start_offset = bt_pool_offset,
.block_size = BINDING_TABLE_POOL_BLOCK_SIZE,
.max_size = device->physical->va.internal_surface_state_pool.size,
});
}
if (result != VK_SUCCESS)
goto fail_bindless_surface_state_pool;
if (device->physical->indirect_descriptors) {
result = anv_state_pool_init(&device->indirect_push_descriptor_pool, device,
&(struct anv_state_pool_params) {
.name = "indirect push descriptor pool",
.base_address = device->physical->va.indirect_push_descriptor_pool.addr,
.block_size = 4096,
.max_size = device->physical->va.indirect_push_descriptor_pool.size,
});
if (result != VK_SUCCESS)
goto fail_binding_table_pool;
}
if (device->vk.enabled_extensions.EXT_descriptor_buffer &&
device->info->verx10 >= 125) {
/* On Gfx12.5+, because of the bindless stages (Mesh, Task, RT), the only
 * way we can wire push descriptors is through the bindless heap. This
 * state pool is a 1GB carve-out of the 4GB HW heap.
 */
result = anv_state_pool_init(&device->push_descriptor_buffer_pool, device,
&(struct anv_state_pool_params) {
.name = "push descriptor buffer state pool",
.base_address = device->physical->va.push_descriptor_buffer_pool.addr,
.block_size = 4096,
.max_size = device->physical->va.push_descriptor_buffer_pool.size,
});
if (result != VK_SUCCESS)
goto fail_indirect_push_descriptor_pool;
}
if (device->info->has_aux_map) {
result = anv_state_pool_init(&device->aux_tt_pool, device,
&(struct anv_state_pool_params) {
.name = "aux-tt pool",
.base_address = device->physical->va.aux_tt_pool.addr,
.block_size = 16384,
.max_size = device->physical->va.aux_tt_pool.size,
});
if (result != VK_SUCCESS)
goto fail_push_descriptor_buffer_pool;
device->aux_map_ctx = intel_aux_map_init(device, &aux_map_allocator,
&physical_device->info);
if (!device->aux_map_ctx) {
   result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
   goto fail_aux_tt_pool;
}
}
result = anv_device_alloc_bo(device, "workaround", 8192,
ANV_BO_ALLOC_CAPTURE |
ANV_BO_ALLOC_HOST_COHERENT |
ANV_BO_ALLOC_MAPPED |
ANV_BO_ALLOC_INTERNAL,
0 /* explicit_address */,
&device->workaround_bo);
if (result != VK_SUCCESS)
goto fail_surface_aux_map_pool;
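/* Workaround BO layout: debug identifiers are written at the start of the
 * BO, the workaround scratch address starts at the next 32-byte aligned
 * offset, and the RT UUID is copied 8 bytes into that area below.
 */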
device->workaround_address = (struct anv_address) {
.bo = device->workaround_bo,
.offset = align(intel_debug_write_identifiers(device->workaround_bo->map,
device->workaround_bo->size,
"Anv"), 32),
};
device->workarounds.doom64_images = NULL;
device->rt_uuid_addr = anv_address_add(device->workaround_address, 8);
memcpy(device->rt_uuid_addr.bo->map + device->rt_uuid_addr.offset,
physical_device->rt_uuid,
sizeof(physical_device->rt_uuid));
device->debug_frame_desc =
intel_debug_get_identifier_block(device->workaround_bo->map,
device->workaround_bo->size,
INTEL_DEBUG_BLOCK_TYPE_FRAME);
if (device->vk.enabled_extensions.KHR_ray_query) {
uint32_t ray_queries_size =
align(brw_rt_ray_queries_hw_stacks_size(device->info), 4096);
result = anv_device_alloc_bo(device, "ray queries",
ray_queries_size,
ANV_BO_ALLOC_INTERNAL,
0 /* explicit_address */,
&device->ray_query_bo);
if (result != VK_SUCCESS)
goto fail_workaround_bo;
}
result = anv_device_init_trivial_batch(device);
if (result != VK_SUCCESS)
goto fail_ray_query_bo;
/* Emit the CPS states before running the initialization batch as those
* structures are referenced.
*/
if (device->info->ver >= 12) {
uint32_t n_cps_states = 3 * 3; /* All combinations of X by Y CP sizes (1, 2, 4) */
if (device->info->has_coarse_pixel_primitive_and_cb)
n_cps_states *= 5 * 5; /* 5 possible values for each of the 2 combiner operators */
n_cps_states += 1; /* Disable CPS */
/* Each of the combinations must be replicated on all viewports */
n_cps_states *= MAX_VIEWPORTS;
device->cps_states =
anv_state_pool_alloc(&device->dynamic_state_pool,
n_cps_states * CPS_STATE_length(device->info) * 4,
32);
if (device->cps_states.map == NULL) {
   result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
   goto fail_trivial_batch;
}
anv_genX(device->info, init_cps_device_state)(device);
if (device->vk.enabled_extensions.EXT_descriptor_buffer) {
device->cps_states_db =
anv_state_pool_alloc(&device->dynamic_state_db_pool,
device->cps_states.alloc_size, 32);
if (device->cps_states_db.map == NULL) {
   result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
   goto fail_trivial_batch;
}
memcpy(device->cps_states_db.map, device->cps_states.map,
device->cps_states.alloc_size);
}
}
if (device->physical->indirect_descriptors) {
/* Allocate a null surface state at surface state offset 0. This makes
* NULL descriptor handling trivial because we can just memset
* structures to zero and they have a valid descriptor.
*/
device->null_surface_state =
anv_state_pool_alloc(&device->bindless_surface_state_pool,
device->isl_dev.ss.size,
device->isl_dev.ss.align);
isl_null_fill_state(&device->isl_dev, device->null_surface_state.map,
.size = isl_extent3d(1, 1, 1) /* This shouldn't matter */);
assert(device->null_surface_state.offset == 0);
} else {
/* When using direct descriptors, those can hold the null surface state
 * directly. We still need a null surface for the binding table entries,
 * but this one can live anywhere in the internal surface state pool.
 */
device->null_surface_state =
anv_state_pool_alloc(&device->internal_surface_state_pool,
device->isl_dev.ss.size,
device->isl_dev.ss.align);
isl_null_fill_state(&device->isl_dev, device->null_surface_state.map,
.size = isl_extent3d(1, 1, 1) /* This shouldn't matter */);
}
isl_null_fill_state(&device->isl_dev, &device->host_null_surface_state,
.size = isl_extent3d(1, 1, 1) /* This shouldn't matter */);
anv_scratch_pool_init(device, &device->scratch_pool);
/* TODO(RT): Do we want some sort of data structure for this? */
memset(device->rt_scratch_bos, 0, sizeof(device->rt_scratch_bos));
if (ANV_SUPPORT_RT && device->info->has_ray_tracing) {
/* The docs say to always allocate 128KB per DSS */
const uint32_t btd_fifo_bo_size =
128 * 1024 * intel_device_info_dual_subslice_id_bound(device->info);
result = anv_device_alloc_bo(device,
"rt-btd-fifo",
btd_fifo_bo_size,
ANV_BO_ALLOC_INTERNAL,
0 /* explicit_address */,
&device->btd_fifo_bo);
if (result != VK_SUCCESS)
goto fail_trivial_batch_bo_and_scratch_pool;
}
result = anv_device_init_trtt(device);
if (result != VK_SUCCESS)
goto fail_btd_fifo_bo;
struct vk_pipeline_cache_create_info pcc_info = { .weak_ref = true, };
device->vk.mem_cache =
vk_pipeline_cache_create(&device->vk, &pcc_info, NULL);
if (!device->vk.mem_cache) {
result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
goto fail_trtt;
}
/* Internal shaders need their own pipeline cache because, unlike the rest
 * of ANV, they won't work at all without the cache. They depend on it for
 * their shaders to remain resident while they run. Therefore, we need a
 * special cache just for BLORP/RT that's forced to always be enabled.
 */
struct vk_pipeline_cache_create_info internal_pcc_info = {
.force_enable = true,
.weak_ref = false,
};
device->internal_cache =
vk_pipeline_cache_create(&device->vk, &internal_pcc_info, NULL);
if (device->internal_cache == NULL) {
result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
goto fail_default_pipeline_cache;
}
/* The device (currently ICL/TGL) does not have float64 support. */
if (!device->info->has_64bit_float &&
device->physical->instance->fp64_workaround_enabled)
anv_load_fp64_shader(device);
if (INTEL_DEBUG(DEBUG_SHADER_PRINT)) {
result = anv_device_print_init(device);
if (result != VK_SUCCESS)
goto fail_internal_cache;
}
result = anv_device_init_rt_shaders(device);
if (result != VK_SUCCESS) {
result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
goto fail_print;
}
#if DETECT_OS_ANDROID
device->u_gralloc = u_gralloc_create(U_GRALLOC_TYPE_AUTO);
#endif
device->robust_buffer_access =
device->vk.enabled_features.robustBufferAccess ||
device->vk.enabled_features.nullDescriptor;
device->breakpoint = anv_state_pool_alloc(&device->dynamic_state_pool, 4,
4);
p_atomic_set(&device->draw_call_count, 0);
/* Create a separate command pool for companion RCS command buffer. */
if (device->info->verx10 >= 125) {
VkCommandPoolCreateInfo pool_info = {
.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
.queueFamilyIndex =
anv_get_first_render_queue_index(device->physical),
};
result = vk_common_CreateCommandPool(anv_device_to_handle(device),
&pool_info, NULL,
&device->companion_rcs_cmd_pool);
if (result != VK_SUCCESS) {
goto fail_internal_cache;
}
}
anv_device_init_blorp(device);
anv_device_init_border_colors(device);
anv_device_init_internal_kernels(device);
anv_device_init_astc_emu(device);
anv_device_perf_init(device);
anv_device_utrace_init(device);
anv_device_init_embedded_samplers(device);
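/* Start with every gfx state flagged dirty, then clear the bits that can
 * never be emitted on this device or with the enabled extensions so they
 * are never considered for re-emission.
 */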
BITSET_ONES(device->gfx_dirty_state);
BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_INDEX_BUFFER);
BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_SO_DECL_LIST);
if (device->info->ver < 11)
BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_VF_SGVS_2);
if (device->info->ver < 12) {
BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_PRIMITIVE_REPLICATION);
BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_DEPTH_BOUNDS);
}
if (!device->vk.enabled_extensions.EXT_sample_locations)
BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_SAMPLE_PATTERN);
if (!device->vk.enabled_extensions.KHR_fragment_shading_rate)
BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_CPS);
if (!device->vk.enabled_extensions.EXT_mesh_shader) {
BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_SBE_MESH);
BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_CLIP_MESH);
BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_MESH_CONTROL);
BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_MESH_SHADER);
BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_MESH_DISTRIB);
BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_TASK_CONTROL);
BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_TASK_SHADER);
BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_TASK_REDISTRIB);
}
if (!intel_needs_workaround(device->info, 18019816803))
BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_WA_18019816803);
if (device->info->ver > 9)
BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_PMA_FIX);
result = anv_genX(device->info, init_device_state)(device);
if (result != VK_SUCCESS)
goto fail_companion_cmd_pool;
*pDevice = anv_device_to_handle(device);
return VK_SUCCESS;
fail_companion_cmd_pool:
anv_device_finish_embedded_samplers(device);
anv_device_utrace_finish(device);
anv_device_finish_blorp(device);
anv_device_finish_rt_shaders(device);
anv_device_finish_astc_emu(device);
anv_device_finish_internal_kernels(device);
if (device->info->verx10 >= 125) {
vk_common_DestroyCommandPool(anv_device_to_handle(device),
device->companion_rcs_cmd_pool, NULL);
}
fail_print:
if (INTEL_DEBUG(DEBUG_SHADER_PRINT))
anv_device_print_fini(device);
fail_internal_cache:
vk_pipeline_cache_destroy(device->internal_cache, NULL);
fail_default_pipeline_cache:
vk_pipeline_cache_destroy(device->vk.mem_cache, NULL);
fail_trtt:
anv_device_finish_trtt(device);
fail_btd_fifo_bo:
if (ANV_SUPPORT_RT && device->info->has_ray_tracing)
anv_device_release_bo(device, device->btd_fifo_bo);
fail_trivial_batch_bo_and_scratch_pool:
anv_scratch_pool_finish(device, &device->scratch_pool);
fail_trivial_batch:
anv_device_release_bo(device, device->trivial_batch_bo);
fail_ray_query_bo:
if (device->ray_query_bo)
anv_device_release_bo(device, device->ray_query_bo);
fail_workaround_bo:
anv_device_release_bo(device, device->workaround_bo);
fail_surface_aux_map_pool:
if (device->info->has_aux_map) {
intel_aux_map_finish(device->aux_map_ctx);
device->aux_map_ctx = NULL;
}
fail_aux_tt_pool:
if (device->info->has_aux_map)
anv_state_pool_finish(&device->aux_tt_pool);
fail_push_descriptor_buffer_pool:
if (device->vk.enabled_extensions.EXT_descriptor_buffer &&
device->info->verx10 >= 125)
anv_state_pool_finish(&device->push_descriptor_buffer_pool);
fail_indirect_push_descriptor_pool:
if (device->physical->indirect_descriptors)
anv_state_pool_finish(&device->indirect_push_descriptor_pool);
fail_binding_table_pool:
anv_state_pool_finish(&device->binding_table_pool);
fail_bindless_surface_state_pool:
if (device->physical->indirect_descriptors)
anv_state_pool_finish(&device->bindless_surface_state_pool);
fail_internal_surface_state_pool:
anv_state_pool_finish(&device->internal_surface_state_pool);
fail_scratch_surface_state_pool:
if (device->info->verx10 >= 125)
anv_state_pool_finish(&device->scratch_surface_state_pool);
fail_instruction_state_pool:
anv_state_pool_finish(&device->instruction_state_pool);
fail_reserved_array_pool:
if (device->vk.enabled_extensions.EXT_descriptor_buffer)
anv_state_reserved_array_pool_finish(&device->custom_border_colors_db);
fail_dynamic_state_db_pool:
anv_state_reserved_pool_finish(&device->custom_border_colors);
if (device->vk.enabled_extensions.EXT_descriptor_buffer)
anv_state_pool_finish(&device->dynamic_state_db_pool);
fail_dynamic_state_pool:
anv_state_pool_finish(&device->dynamic_state_pool);
fail_general_state_pool:
anv_state_pool_finish(&device->general_state_pool);
fail_batch_bo_pool:
if (device->vk.enabled_extensions.KHR_acceleration_structure)
anv_bo_pool_finish(&device->bvh_bo_pool);
anv_bo_pool_finish(&device->batch_bo_pool);
anv_bo_cache_finish(&device->bo_cache);
fail_queue_cond:
pthread_cond_destroy(&device->queue_submit);
fail_mutex:
pthread_mutex_destroy(&device->mutex);
fail_vmas:
util_vma_heap_finish(&device->vma_trtt);
util_vma_heap_finish(&device->vma_samplers);
util_vma_heap_finish(&device->vma_desc_buf);
util_vma_heap_finish(&device->vma_desc);
util_vma_heap_finish(&device->vma_hi);
util_vma_heap_finish(&device->vma_lo);
pthread_mutex_destroy(&device->vma_mutex);
fail_queues:
for (uint32_t i = 0; i < device->queue_count; i++)
anv_queue_finish(&device->queues[i]);
vk_free(&device->vk.alloc, device->queues);
fail_context_id:
anv_device_destroy_context_or_vm(device);
fail_fd:
close(device->fd);
fail_device:
vk_device_finish(&device->vk);
fail_alloc:
vk_free(&device->vk.alloc, device);
return result;
}
void anv_DestroyDevice(
VkDevice _device,
const VkAllocationCallbacks* pAllocator)
{
ANV_FROM_HANDLE(anv_device, device, _device);
if (!device)
return;
#if DETECT_OS_ANDROID
u_gralloc_destroy(&device->u_gralloc);
#endif
anv_memory_trace_finish(device);
struct anv_physical_device *pdevice = device->physical;
for (uint32_t i = 0; i < device->queue_count; i++)
anv_queue_finish(&device->queues[i]);
vk_free(&device->vk.alloc, device->queues);
anv_device_utrace_finish(device);
anv_device_finish_blorp(device);
anv_device_finish_rt_shaders(device);
anv_device_finish_astc_emu(device);
anv_device_finish_internal_kernels(device);
if (INTEL_DEBUG(DEBUG_SHADER_PRINT))
anv_device_print_fini(device);
vk_pipeline_cache_destroy(device->internal_cache, NULL);
vk_pipeline_cache_destroy(device->vk.mem_cache, NULL);
anv_device_finish_embedded_samplers(device);
anv_device_finish_trtt(device);
if (ANV_SUPPORT_RT && device->info->has_ray_tracing)
anv_device_release_bo(device, device->btd_fifo_bo);
if (device->info->verx10 >= 125) {
vk_common_DestroyCommandPool(anv_device_to_handle(device),
device->companion_rcs_cmd_pool, NULL);
}
if (device->vk.enabled_extensions.EXT_descriptor_buffer)
anv_state_reserved_array_pool_finish(&device->custom_border_colors_db);
#ifdef HAVE_VALGRIND
/* We only need to free these to prevent valgrind errors. The backing
* BO will go away in a couple of lines so we don't actually leak.
*/
anv_state_reserved_pool_finish(&device->custom_border_colors);
anv_state_pool_free(&device->dynamic_state_pool, device->border_colors);
anv_state_pool_free(&device->dynamic_state_pool, device->slice_hash);
anv_state_pool_free(&device->dynamic_state_pool, device->cps_states);
anv_state_pool_free(&device->dynamic_state_pool, device->breakpoint);
if (device->vk.enabled_extensions.EXT_descriptor_buffer) {
anv_state_pool_free(&device->dynamic_state_db_pool, device->cps_states_db);
anv_state_pool_free(&device->dynamic_state_db_pool, device->slice_hash_db);
anv_state_pool_free(&device->dynamic_state_db_pool, device->border_colors_db);
}
#endif
for (unsigned i = 0; i < ARRAY_SIZE(device->rt_scratch_bos); i++) {
if (device->rt_scratch_bos[i] != NULL)
anv_device_release_bo(device, device->rt_scratch_bos[i]);
}
anv_scratch_pool_finish(device, &device->scratch_pool);
if (device->vk.enabled_extensions.KHR_ray_query) {
for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_shadow_bos); i++) {
if (device->ray_query_shadow_bos[i] != NULL)
anv_device_release_bo(device, device->ray_query_shadow_bos[i]);
}
anv_device_release_bo(device, device->ray_query_bo);
}
anv_device_release_bo(device, device->workaround_bo);
anv_device_release_bo(device, device->trivial_batch_bo);
if (device->info->has_aux_map) {
intel_aux_map_finish(device->aux_map_ctx);
device->aux_map_ctx = NULL;
anv_state_pool_finish(&device->aux_tt_pool);
}
if (device->vk.enabled_extensions.EXT_descriptor_buffer &&
device->info->verx10 >= 125)
anv_state_pool_finish(&device->push_descriptor_buffer_pool);
if (device->physical->indirect_descriptors)
anv_state_pool_finish(&device->indirect_push_descriptor_pool);
anv_state_pool_finish(&device->binding_table_pool);
if (device->info->verx10 >= 125)
anv_state_pool_finish(&device->scratch_surface_state_pool);
anv_state_pool_finish(&device->internal_surface_state_pool);
if (device->physical->indirect_descriptors)
anv_state_pool_finish(&device->bindless_surface_state_pool);
anv_state_pool_finish(&device->instruction_state_pool);
if (device->vk.enabled_extensions.EXT_descriptor_buffer)
anv_state_pool_finish(&device->dynamic_state_db_pool);
anv_state_pool_finish(&device->dynamic_state_pool);
anv_state_pool_finish(&device->general_state_pool);
if (device->vk.enabled_extensions.KHR_acceleration_structure)
anv_bo_pool_finish(&device->bvh_bo_pool);
anv_bo_pool_finish(&device->batch_bo_pool);
anv_bo_cache_finish(&device->bo_cache);
util_vma_heap_finish(&device->vma_trtt);
util_vma_heap_finish(&device->vma_samplers);
util_vma_heap_finish(&device->vma_desc_buf);
util_vma_heap_finish(&device->vma_desc);
util_vma_heap_finish(&device->vma_hi);
util_vma_heap_finish(&device->vma_lo);
pthread_mutex_destroy(&device->vma_mutex);
pthread_cond_destroy(&device->queue_submit);
pthread_mutex_destroy(&device->mutex);
ralloc_free(device->fp64_nir);
anv_device_destroy_context_or_vm(device);
if (INTEL_DEBUG(DEBUG_BATCH | DEBUG_BATCH_STATS)) {
for (unsigned i = 0; i < pdevice->queue.family_count; i++) {
if (INTEL_DEBUG(DEBUG_BATCH_STATS))
intel_batch_print_stats(&device->decoder[i]);
intel_batch_decode_ctx_finish(&device->decoder[i]);
}
}
close(device->fd);
vk_device_finish(&device->vk);
vk_free(&device->vk.alloc, device);
}
VkResult anv_EnumerateInstanceLayerProperties(
uint32_t* pPropertyCount,
VkLayerProperties* pProperties)
{
if (pProperties == NULL) {
*pPropertyCount = 0;
return VK_SUCCESS;
}
/* None supported at this time */
return vk_error(NULL, VK_ERROR_LAYER_NOT_PRESENT);
}
VkResult
anv_device_wait(struct anv_device *device, struct anv_bo *bo,
int64_t timeout)
{
int ret = anv_gem_wait(device, bo->gem_handle, &timeout);
if (ret == -1 && errno == ETIME) {
return VK_TIMEOUT;
} else if (ret == -1) {
/* We don't know the real error. */
return vk_device_set_lost(&device->vk, "gem wait failed: %m");
} else {
return VK_SUCCESS;
}
}
static struct util_vma_heap *
anv_vma_heap_for_flags(struct anv_device *device,
enum anv_bo_alloc_flags alloc_flags)
{
if (alloc_flags & ANV_BO_ALLOC_TRTT)
return &device->vma_trtt;
if (alloc_flags & ANV_BO_ALLOC_DESCRIPTOR_BUFFER_POOL)
return &device->vma_desc_buf;
if (alloc_flags & ANV_BO_ALLOC_32BIT_ADDRESS)
return &device->vma_lo;
if (alloc_flags & ANV_BO_ALLOC_DESCRIPTOR_POOL)
return &device->vma_desc;
if (alloc_flags & ANV_BO_ALLOC_SAMPLER_POOL)
return &device->vma_samplers;
return &device->vma_hi;
}
uint64_t
anv_vma_alloc(struct anv_device *device,
uint64_t size, uint64_t align,
enum anv_bo_alloc_flags alloc_flags,
uint64_t client_address,
struct util_vma_heap **out_vma_heap)
{
pthread_mutex_lock(&device->vma_mutex);
uint64_t addr = 0;
*out_vma_heap = anv_vma_heap_for_flags(device, alloc_flags);
if (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) {
assert(*out_vma_heap == &device->vma_hi ||
*out_vma_heap == &device->vma_desc_buf ||
*out_vma_heap == &device->vma_trtt);
if (client_address) {
if (util_vma_heap_alloc_addr(*out_vma_heap,
client_address, size)) {
addr = client_address;
}
} else {
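/* No specific address requested: allocate bottom-up (the heap otherwise
 * allocates top-down), presumably to keep implicitly assigned
 * client-visible addresses away from the driver's own allocations.
 */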
(*out_vma_heap)->alloc_high = false;
addr = util_vma_heap_alloc(*out_vma_heap, size, align);
(*out_vma_heap)->alloc_high = true;
}
/* We don't want to fall back to other heaps */
goto done;
}
assert(client_address == 0);
addr = util_vma_heap_alloc(*out_vma_heap, size, align);
done:
pthread_mutex_unlock(&device->vma_mutex);
assert(addr == intel_48b_address(addr));
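/* The heaps only deal in 48-bit addresses; hand back the canonical
 * (sign-extended) form expected by the rest of the driver.
 */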
return intel_canonical_address(addr);
}
void
anv_vma_free(struct anv_device *device,
struct util_vma_heap *vma_heap,
uint64_t address, uint64_t size)
{
assert(vma_heap == &device->vma_lo ||
vma_heap == &device->vma_hi ||
vma_heap == &device->vma_desc ||
vma_heap == &device->vma_desc_buf ||
vma_heap == &device->vma_samplers ||
vma_heap == &device->vma_trtt);
const uint64_t addr_48b = intel_48b_address(address);
pthread_mutex_lock(&device->vma_mutex);
util_vma_heap_free(vma_heap, addr_48b, size);
pthread_mutex_unlock(&device->vma_mutex);
}
VkResult anv_AllocateMemory(
VkDevice _device,
const VkMemoryAllocateInfo* pAllocateInfo,
const VkAllocationCallbacks* pAllocator,
VkDeviceMemory* pMem)
{
ANV_FROM_HANDLE(anv_device, device, _device);
struct anv_physical_device *pdevice = device->physical;
struct anv_device_memory *mem;
VkResult result = VK_SUCCESS;
assert(pAllocateInfo->sType == VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO);
VkDeviceSize aligned_alloc_size =
align64(pAllocateInfo->allocationSize, 4096);
assert(pAllocateInfo->memoryTypeIndex < pdevice->memory.type_count);
const struct anv_memory_type *mem_type =
&pdevice->memory.types[pAllocateInfo->memoryTypeIndex];
assert(mem_type->heapIndex < pdevice->memory.heap_count);
struct anv_memory_heap *mem_heap =
&pdevice->memory.heaps[mem_type->heapIndex];
if (aligned_alloc_size > mem_heap->size)
return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
uint64_t mem_heap_used = p_atomic_read(&mem_heap->used);
if (mem_heap_used + aligned_alloc_size > mem_heap->size)
return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
mem = vk_device_memory_create(&device->vk, pAllocateInfo,
pAllocator, sizeof(*mem));
if (mem == NULL)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
mem->type = mem_type;
mem->map = NULL;
mem->map_size = 0;
mem->map_delta = 0;
enum anv_bo_alloc_flags alloc_flags = 0;
const VkImportMemoryFdInfoKHR *fd_info = NULL;
const VkMemoryDedicatedAllocateInfo *dedicated_info = NULL;
const struct wsi_memory_allocate_info *wsi_info = NULL;
uint64_t client_address = 0;
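/* Walk the pNext chain to gather import/export information first; the
 * allocation then takes one of four paths below: AHardwareBuffer import,
 * fd (opaque/dma-buf) import, host-pointer import, or a fresh BO
 * allocation.
 */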
vk_foreach_struct_const(ext, pAllocateInfo->pNext) {
/* VK_STRUCTURE_TYPE_WSI_MEMORY_ALLOCATE_INFO_MESA isn't a real enum
 * value, so use a cast to avoid a compiler warning.
 */
switch ((uint32_t)ext->sType) {
case VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO:
case VK_STRUCTURE_TYPE_IMPORT_ANDROID_HARDWARE_BUFFER_INFO_ANDROID:
case VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT:
case VK_STRUCTURE_TYPE_IMPORT_MEMORY_WIN32_HANDLE_INFO_KHR:
case VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO:
/* handled by vk_device_memory_create */
break;
case VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR:
fd_info = (void *)ext;
break;
case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO:
dedicated_info = (void *)ext;
break;
case VK_STRUCTURE_TYPE_MEMORY_OPAQUE_CAPTURE_ADDRESS_ALLOCATE_INFO: {
const VkMemoryOpaqueCaptureAddressAllocateInfo *addr_info =
(const VkMemoryOpaqueCaptureAddressAllocateInfo *)ext;
client_address = addr_info->opaqueCaptureAddress;
break;
}
case VK_STRUCTURE_TYPE_WSI_MEMORY_ALLOCATE_INFO_MESA:
wsi_info = (void *)ext;
break;
default:
vk_debug_ignored_stype(ext->sType);
break;
}
}
/* If i915 reported mappable/non_mappable vram regions and the
 * application wants lmem mappable, then we need to use the
 * I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS flag to create our BO.
 */
if (pdevice->vram_mappable.size > 0 &&
pdevice->vram_non_mappable.size > 0 &&
(mem_type->propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) &&
(mem_type->propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT))
alloc_flags |= ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE;
if (!mem_heap->is_local_mem)
alloc_flags |= ANV_BO_ALLOC_NO_LOCAL_MEM;
if (mem->vk.alloc_flags & VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT)
alloc_flags |= ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS;
if (mem_type->propertyFlags & VK_MEMORY_PROPERTY_PROTECTED_BIT)
alloc_flags |= ANV_BO_ALLOC_PROTECTED;
/* For now, always allocate AUX-TT aligned memory, regardless of dedicated
 * allocations. An application can, for example, suballocate a large
 * VkDeviceMemory and try to bind an image created with a CCS modifier. In
 * that case we cannot disable CCS if the alignment doesn't meet the AUX-TT
 * requirements, so we need to ensure both the VkDeviceMemory and the
 * alignment reported through vkGetImageMemoryRequirements() meet the
 * AUX-TT requirement.
 *
 * TODO: when we enable EXT_descriptor_buffer, we'll be able to drop the
 * AUX-TT alignment for that type of allocation.
 */
if (device->info->has_aux_map)
alloc_flags |= ANV_BO_ALLOC_AUX_TT_ALIGNED;
/* If the allocation is not dedicated nor a host pointer, allocate
* additional CCS space.
*
* TODO: If we ever ship VK_EXT_descriptor_buffer (ahahah... :() we could
* drop this flag in the descriptor buffer case as we don't need any
* compression there.
*
* TODO: We could also create new memory types for allocations that don't
* need any compression.
*/
if (device->physical->alloc_aux_tt_mem &&
dedicated_info == NULL &&
mem->vk.host_ptr == NULL)
alloc_flags |= ANV_BO_ALLOC_AUX_CCS;
/* TODO: Android, ChromeOS and other applications may need another way to
 * allocate buffers that can be scanned out to display, but it should be
 * pretty easy to catch those as the Xe KMD will print warnings in dmesg
 * when scanning out buffers allocated without the proper flag set.
 */
if (wsi_info)
alloc_flags |= ANV_BO_ALLOC_SCANOUT;
/* Anything imported or exported is EXTERNAL */
if (mem->vk.export_handle_types || mem->vk.import_handle_type) {
alloc_flags |= ANV_BO_ALLOC_EXTERNAL;
/* wsi has its own way of synchronizing with the compositor */
if (pdevice->instance->external_memory_implicit_sync &&
!wsi_info && dedicated_info &&
dedicated_info->image != VK_NULL_HANDLE) {
ANV_FROM_HANDLE(anv_image, image, dedicated_info->image);
/* Apply implicit sync to be compatible with clients relying on
 * implicit fencing. This matches the behavior in the iris i915 batch
 * submission. An example client is VA-API (iHD), so only the dedicated
 * image scenario has to be covered.
 */
alloc_flags |= ANV_BO_ALLOC_IMPLICIT_SYNC;
/* For color attachment, apply IMPLICIT_WRITE so a client on the
* consumer side relying on implicit fencing can have a fence to
* wait for render complete.
*/
if (image->vk.usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT)
alloc_flags |= ANV_BO_ALLOC_IMPLICIT_WRITE;
}
}
if (mem_type->descriptor_buffer)
alloc_flags |= ANV_BO_ALLOC_DESCRIPTOR_BUFFER_POOL;
if (mem->vk.ahardware_buffer) {
result = anv_import_ahw_memory(_device, mem);
if (result != VK_SUCCESS)
goto fail;
goto success;
}
/* The Vulkan spec permits handleType to be 0, in which case the struct is
* ignored.
*/
if (fd_info && fd_info->handleType) {
/* At the moment, we support only the below handle types. */
assert(fd_info->handleType ==
VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT ||
fd_info->handleType ==
VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT);
result = anv_device_import_bo(device, fd_info->fd, alloc_flags,
client_address, &mem->bo);
if (result != VK_SUCCESS)
goto fail;
/* For security purposes, we reject importing the bo if it's smaller
* than the requested allocation size. This prevents a malicious client
* from passing a buffer to a trusted client, lying about the size, and
* telling the trusted client to try and texture from an image that goes
* out-of-bounds. This sort of thing could lead to GPU hangs or worse
* in the trusted client. The trusted client can protect itself against
* this sort of attack but only if it can trust the buffer size.
*/
if (mem->bo->size < aligned_alloc_size) {
result = vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
"aligned allocationSize too large for "
"VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT: "
"%"PRIu64"B > %"PRIu64"B",
aligned_alloc_size, mem->bo->size);
anv_device_release_bo(device, mem->bo);
goto fail;
}
/* From the Vulkan spec:
*
* "Importing memory from a file descriptor transfers ownership of
* the file descriptor from the application to the Vulkan
* implementation. The application must not perform any operations on
* the file descriptor after a successful import."
*
* If the import fails, we leave the file descriptor open.
*/
close(fd_info->fd);
goto success;
}
if (mem->vk.host_ptr) {
if (mem->vk.import_handle_type ==
VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_MAPPED_FOREIGN_MEMORY_BIT_EXT) {
result = vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
goto fail;
}
assert(mem->vk.import_handle_type ==
VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT);
result = anv_device_import_bo_from_host_ptr(device,
mem->vk.host_ptr,
mem->vk.size,
alloc_flags,
client_address,
&mem->bo);
if (result != VK_SUCCESS)
goto fail;
goto success;
}
if (alloc_flags & (ANV_BO_ALLOC_EXTERNAL | ANV_BO_ALLOC_SCANOUT)) {
alloc_flags |= ANV_BO_ALLOC_HOST_COHERENT;
} else if (mem_type->propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
if (mem_type->propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)
alloc_flags |= ANV_BO_ALLOC_HOST_COHERENT;
if (mem_type->propertyFlags & VK_MEMORY_PROPERTY_HOST_CACHED_BIT)
alloc_flags |= ANV_BO_ALLOC_HOST_CACHED;
} else {
/* Some host caching mode is required to get a valid PAT index. */
alloc_flags |= ANV_BO_ALLOC_HOST_COHERENT;
}
/* Regular allocate (not importing memory). */
result = anv_device_alloc_bo(device, "user", pAllocateInfo->allocationSize,
alloc_flags, client_address, &mem->bo);
if (result != VK_SUCCESS)
goto fail;
if (dedicated_info && dedicated_info->image != VK_NULL_HANDLE) {
ANV_FROM_HANDLE(anv_image, image, dedicated_info->image);
/* Some legacy (non-modifiers) consumers need the tiling to be set on
* the BO. In this case, we have a dedicated allocation.
*/
if (image->vk.wsi_legacy_scanout) {
const struct isl_surf *surf = &image->planes[0].primary_surface.isl;
result = anv_device_set_bo_tiling(device, mem->bo,
surf->row_pitch_B,
surf->tiling);
if (result != VK_SUCCESS) {
anv_device_release_bo(device, mem->bo);
goto fail;
}
}
}
success:
mem_heap_used = p_atomic_add_return(&mem_heap->used, mem->bo->size);
if (mem_heap_used > mem_heap->size) {
p_atomic_add(&mem_heap->used, -mem->bo->size);
anv_device_release_bo(device, mem->bo);
result = vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY,
"Out of heap memory");
goto fail;
}
pthread_mutex_lock(&device->mutex);
list_addtail(&mem->link, &device->memory_objects);
pthread_mutex_unlock(&device->mutex);
ANV_RMV(heap_create, device, mem, false, 0);
*pMem = anv_device_memory_to_handle(mem);
return VK_SUCCESS;
fail:
vk_device_memory_destroy(&device->vk, pAllocator, &mem->vk);
return result;
}
VkResult anv_GetMemoryFdKHR(
VkDevice device_h,
const VkMemoryGetFdInfoKHR* pGetFdInfo,
int* pFd)
{
ANV_FROM_HANDLE(anv_device, dev, device_h);
ANV_FROM_HANDLE(anv_device_memory, mem, pGetFdInfo->memory);
assert(pGetFdInfo->sType == VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR);
assert(pGetFdInfo->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT ||
pGetFdInfo->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT);
return anv_device_export_bo(dev, mem->bo, pFd);
}
VkResult anv_GetMemoryFdPropertiesKHR(
VkDevice _device,
VkExternalMemoryHandleTypeFlagBits handleType,
int fd,
VkMemoryFdPropertiesKHR* pMemoryFdProperties)
{
ANV_FROM_HANDLE(anv_device, device, _device);
switch (handleType) {
case VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT:
/* dma-buf can be imported as any memory type */
pMemoryFdProperties->memoryTypeBits =
(1 << device->physical->memory.type_count) - 1;
return VK_SUCCESS;
default:
/* The valid usage section for this function says:
*
* "handleType must not be one of the handle types defined as
* opaque."
*
* So opaque handle types fall into the default "unsupported" case.
*/
return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
}
}
VkResult anv_GetMemoryHostPointerPropertiesEXT(
VkDevice _device,
VkExternalMemoryHandleTypeFlagBits handleType,
const void* pHostPointer,
VkMemoryHostPointerPropertiesEXT* pMemoryHostPointerProperties)
{
ANV_FROM_HANDLE(anv_device, device, _device);
assert(pMemoryHostPointerProperties->sType ==
VK_STRUCTURE_TYPE_MEMORY_HOST_POINTER_PROPERTIES_EXT);
switch (handleType) {
case VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT:
/* Host memory can be imported as any memory type. */
pMemoryHostPointerProperties->memoryTypeBits =
(1ull << device->physical->memory.type_count) - 1;
return VK_SUCCESS;
default:
return VK_ERROR_INVALID_EXTERNAL_HANDLE;
}
}
void anv_FreeMemory(
VkDevice _device,
VkDeviceMemory _mem,
const VkAllocationCallbacks* pAllocator)
{
ANV_FROM_HANDLE(anv_device, device, _device);
ANV_FROM_HANDLE(anv_device_memory, mem, _mem);
if (mem == NULL)
return;
pthread_mutex_lock(&device->mutex);
list_del(&mem->link);
pthread_mutex_unlock(&device->mutex);
if (mem->map) {
const VkMemoryUnmapInfoKHR unmap = {
.sType = VK_STRUCTURE_TYPE_MEMORY_UNMAP_INFO_KHR,
.memory = _mem,
};
anv_UnmapMemory2KHR(_device, &unmap);
}
p_atomic_add(&device->physical->memory.heaps[mem->type->heapIndex].used,
-mem->bo->size);
anv_device_release_bo(device, mem->bo);
ANV_RMV(resource_destroy, device, mem);
vk_device_memory_destroy(&device->vk, pAllocator, &mem->vk);
}
VkResult anv_MapMemory2KHR(
VkDevice _device,
const VkMemoryMapInfoKHR* pMemoryMapInfo,
void** ppData)
{
ANV_FROM_HANDLE(anv_device, device, _device);
ANV_FROM_HANDLE(anv_device_memory, mem, pMemoryMapInfo->memory);
if (mem == NULL) {
*ppData = NULL;
return VK_SUCCESS;
}
if (mem->vk.host_ptr) {
*ppData = mem->vk.host_ptr + pMemoryMapInfo->offset;
return VK_SUCCESS;
}
/* From the Vulkan spec version 1.0.32 docs for MapMemory:
*
* * memory must have been created with a memory type that reports
* VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT
*/
if (!(mem->type->propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)) {
return vk_errorf(device, VK_ERROR_MEMORY_MAP_FAILED,
"Memory object not mappable.");
}
assert(pMemoryMapInfo->size > 0);
const VkDeviceSize offset = pMemoryMapInfo->offset;
const VkDeviceSize size =
vk_device_memory_range(&mem->vk, pMemoryMapInfo->offset,
pMemoryMapInfo->size);
if (size != (size_t)size) {
return vk_errorf(device, VK_ERROR_MEMORY_MAP_FAILED,
"requested size 0x%"PRIx64" does not fit in %u bits",
size, (unsigned)(sizeof(size_t) * 8));
}
/* From the Vulkan 1.2.194 spec:
*
* "memory must not be currently host mapped"
*/
if (mem->map != NULL) {
return vk_errorf(device, VK_ERROR_MEMORY_MAP_FAILED,
"Memory object already mapped.");
}
void *placed_addr = NULL;
if (pMemoryMapInfo->flags & VK_MEMORY_MAP_PLACED_BIT_EXT) {
const VkMemoryMapPlacedInfoEXT *placed_info =
vk_find_struct_const(pMemoryMapInfo->pNext, MEMORY_MAP_PLACED_INFO_EXT);
assert(placed_info != NULL);
placed_addr = placed_info->pPlacedAddress;
}
/* GEM will fail to map if the offset isn't 4k-aligned. Round down. */
uint64_t map_offset;
if (!device->physical->info.has_mmap_offset)
map_offset = offset & ~4095ull;
else
map_offset = 0;
assert(offset >= map_offset);
uint64_t map_size = (offset + size) - map_offset;
/* Let's map whole pages */
map_size = align64(map_size, 4096);
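/* Example (without mmap_offset support): offset = 5000, size = 100 gives
 * map_offset = 4096, map_size = align64(1004, 4096) = 4096 and
 * map_delta = 904, so *ppData points 904 bytes into the mapping.
 */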
void *map;
VkResult result = anv_device_map_bo(device, mem->bo, map_offset,
map_size, placed_addr, &map);
if (result != VK_SUCCESS)
return result;
mem->map = map;
mem->map_size = map_size;
mem->map_delta = (offset - map_offset);
*ppData = mem->map + mem->map_delta;
return VK_SUCCESS;
}
VkResult anv_UnmapMemory2KHR(
VkDevice _device,
const VkMemoryUnmapInfoKHR* pMemoryUnmapInfo)
{
ANV_FROM_HANDLE(anv_device, device, _device);
ANV_FROM_HANDLE(anv_device_memory, mem, pMemoryUnmapInfo->memory);
if (mem == NULL || mem->vk.host_ptr)
return VK_SUCCESS;
VkResult result =
anv_device_unmap_bo(device, mem->bo, mem->map, mem->map_size,
pMemoryUnmapInfo->flags & VK_MEMORY_UNMAP_RESERVE_BIT_EXT);
if (result != VK_SUCCESS)
return result;
mem->map = NULL;
mem->map_size = 0;
mem->map_delta = 0;
return VK_SUCCESS;
}
VkResult anv_FlushMappedMemoryRanges(
VkDevice _device,
uint32_t memoryRangeCount,
const VkMappedMemoryRange* pMemoryRanges)
{
#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
ANV_FROM_HANDLE(anv_device, device, _device);
if (!device->physical->memory.need_flush)
return VK_SUCCESS;
/* Make sure the writes we're flushing have landed. */
__builtin_ia32_mfence();
for (uint32_t i = 0; i < memoryRangeCount; i++) {
ANV_FROM_HANDLE(anv_device_memory, mem, pMemoryRanges[i].memory);
if (mem->type->propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)
continue;
uint64_t map_offset = pMemoryRanges[i].offset + mem->map_delta;
if (map_offset >= mem->map_size)
continue;
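/* size may be VK_WHOLE_SIZE (~0ull); the MIN2 below clamps the flush to
 * the mapped window either way.
 */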
intel_flush_range(mem->map + map_offset,
MIN2(pMemoryRanges[i].size,
mem->map_size - map_offset));
}
#endif
return VK_SUCCESS;
}
VkResult anv_InvalidateMappedMemoryRanges(
VkDevice _device,
uint32_t memoryRangeCount,
const VkMappedMemoryRange* pMemoryRanges)
{
#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
ANV_FROM_HANDLE(anv_device, device, _device);
if (!device->physical->memory.need_flush)
return VK_SUCCESS;
for (uint32_t i = 0; i < memoryRangeCount; i++) {
ANV_FROM_HANDLE(anv_device_memory, mem, pMemoryRanges[i].memory);
if (mem->type->propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)
continue;
uint64_t map_offset = pMemoryRanges[i].offset + mem->map_delta;
if (map_offset >= mem->map_size)
continue;
intel_invalidate_range(mem->map + map_offset,
MIN2(pMemoryRanges[i].size,
mem->map_size - map_offset));
}
/* Make sure no reads get moved up above the invalidate. */
__builtin_ia32_mfence();
#endif
return VK_SUCCESS;
}
void anv_GetDeviceMemoryCommitment(
VkDevice device,
VkDeviceMemory memory,
VkDeviceSize* pCommittedMemoryInBytes)
{
*pCommittedMemoryInBytes = 0;
}
static void
anv_bind_buffer_memory(struct anv_device *device,
const VkBindBufferMemoryInfo *pBindInfo)
{
ANV_FROM_HANDLE(anv_device_memory, mem, pBindInfo->memory);
ANV_FROM_HANDLE(anv_buffer, buffer, pBindInfo->buffer);
assert(pBindInfo->sType == VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO);
assert(!anv_buffer_is_sparse(buffer));
const VkBindMemoryStatusKHR *bind_status =
vk_find_struct_const(pBindInfo->pNext, BIND_MEMORY_STATUS_KHR);
if (mem) {
assert(pBindInfo->memoryOffset < mem->vk.size);
assert(mem->vk.size - pBindInfo->memoryOffset >= buffer->vk.size);
buffer->address = (struct anv_address) {
.bo = mem->bo,
.offset = pBindInfo->memoryOffset,
};
} else {
buffer->address = ANV_NULL_ADDRESS;
}
ANV_RMV(buffer_bind, device, buffer);
if (bind_status)
*bind_status->pResult = VK_SUCCESS;
}
VkResult anv_BindBufferMemory2(
VkDevice _device,
uint32_t bindInfoCount,
const VkBindBufferMemoryInfo* pBindInfos)
{
ANV_FROM_HANDLE(anv_device, device, _device);
for (uint32_t i = 0; i < bindInfoCount; i++)
anv_bind_buffer_memory(device, &pBindInfos[i]);
return VK_SUCCESS;
}
// Event functions
VkResult anv_CreateEvent(
VkDevice _device,
const VkEventCreateInfo* pCreateInfo,
const VkAllocationCallbacks* pAllocator,
VkEvent* pEvent)
{
ANV_FROM_HANDLE(anv_device, device, _device);
struct anv_event *event;
assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_EVENT_CREATE_INFO);
event = vk_object_alloc(&device->vk, pAllocator, sizeof(*event),
VK_OBJECT_TYPE_EVENT);
if (event == NULL)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
event->state = anv_state_pool_alloc(&device->dynamic_state_pool,
sizeof(uint64_t), 8);
*(uint64_t *)event->state.map = VK_EVENT_RESET;
ANV_RMV(event_create, device, event, pCreateInfo->flags, false);
*pEvent = anv_event_to_handle(event);
return VK_SUCCESS;
}
void anv_DestroyEvent(
VkDevice _device,
VkEvent _event,
const VkAllocationCallbacks* pAllocator)
{
ANV_FROM_HANDLE(anv_device, device, _device);
ANV_FROM_HANDLE(anv_event, event, _event);
if (!event)
return;
ANV_RMV(resource_destroy, device, event);
anv_state_pool_free(&device->dynamic_state_pool, event->state);
vk_object_free(&device->vk, pAllocator, event);
}
VkResult anv_GetEventStatus(
VkDevice _device,
VkEvent _event)
{
ANV_FROM_HANDLE(anv_device, device, _device);
ANV_FROM_HANDLE(anv_event, event, _event);
if (vk_device_is_lost(&device->vk))
return VK_ERROR_DEVICE_LOST;
return *(uint64_t *)event->state.map;
}
VkResult anv_SetEvent(
VkDevice _device,
VkEvent _event)
{
ANV_FROM_HANDLE(anv_event, event, _event);
*(uint64_t *)event->state.map = VK_EVENT_SET;
return VK_SUCCESS;
}
VkResult anv_ResetEvent(
VkDevice _device,
VkEvent _event)
{
ANV_FROM_HANDLE(anv_event, event, _event);
*(uint64_t *)event->state.map = VK_EVENT_RESET;
return VK_SUCCESS;
}
// Buffer functions
static void
anv_get_buffer_memory_requirements(struct anv_device *device,
VkBufferCreateFlags flags,
VkDeviceSize size,
VkBufferUsageFlags usage,
bool is_sparse,
VkMemoryRequirements2* pMemoryRequirements)
{
/* The Vulkan spec (git aaed022) says:
*
* memoryTypeBits is a bitfield and contains one bit set for every
* supported memory type for the resource. The bit `1<<i` is set if and
* only if the memory type `i` in the VkPhysicalDeviceMemoryProperties
* structure for the physical device is supported.
*
* We have special memory types for descriptor buffers.
*/
uint32_t memory_types =
(flags & VK_BUFFER_CREATE_PROTECTED_BIT) ?
device->physical->memory.protected_mem_types :
((usage & (VK_BUFFER_USAGE_RESOURCE_DESCRIPTOR_BUFFER_BIT_EXT |
VK_BUFFER_USAGE_SAMPLER_DESCRIPTOR_BUFFER_BIT_EXT)) ?
device->physical->memory.desc_buffer_mem_types :
device->physical->memory.default_buffer_mem_types);
/* The GPU appears to write back to main memory in cachelines. Writes to
 * one buffer should not clobber writes to another buffer, so make sure
 * they are in different cachelines.
 */
uint32_t alignment = 64;
/* From the spec, section "Sparse Buffer and Fully-Resident Image Block
* Size":
* "The sparse block size in bytes for sparse buffers and fully-resident
* images is reported as VkMemoryRequirements::alignment. alignment
* represents both the memory alignment requirement and the binding
* granularity (in bytes) for sparse resources."
*/
if (is_sparse) {
alignment = ANV_SPARSE_BLOCK_SIZE;
size = align64(size, alignment);
}
pMemoryRequirements->memoryRequirements.size = size;
pMemoryRequirements->memoryRequirements.alignment = alignment;
/* Storage and Uniform buffers should have their size aligned to
 * 32-bits to avoid boundary checks when the last DWord is not complete.
 * This ensures that no internal padding is needed for 16-bit types.
 */
if (device->robust_buffer_access &&
(usage & VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT ||
usage & VK_BUFFER_USAGE_STORAGE_BUFFER_BIT))
pMemoryRequirements->memoryRequirements.size = align64(size, 4);
pMemoryRequirements->memoryRequirements.memoryTypeBits = memory_types;
vk_foreach_struct(ext, pMemoryRequirements->pNext) {
switch (ext->sType) {
case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: {
VkMemoryDedicatedRequirements *requirements = (void *)ext;
requirements->prefersDedicatedAllocation = false;
requirements->requiresDedicatedAllocation = false;
break;
}
default:
vk_debug_ignored_stype(ext->sType);
break;
}
}
}
void anv_GetDeviceBufferMemoryRequirements(
VkDevice _device,
const VkDeviceBufferMemoryRequirements* pInfo,
VkMemoryRequirements2* pMemoryRequirements)
{
ANV_FROM_HANDLE(anv_device, device, _device);
const bool is_sparse =
pInfo->pCreateInfo->flags & VK_BUFFER_CREATE_SPARSE_BINDING_BIT;
if ((device->physical->sparse_type == ANV_SPARSE_TYPE_NOT_SUPPORTED) &&
INTEL_DEBUG(DEBUG_SPARSE) &&
pInfo->pCreateInfo->flags & (VK_BUFFER_CREATE_SPARSE_BINDING_BIT |
VK_BUFFER_CREATE_SPARSE_RESIDENCY_BIT |
VK_BUFFER_CREATE_SPARSE_ALIASED_BIT))
fprintf(stderr, "=== %s %s:%d flags:0x%08x\n", __func__, __FILE__,
__LINE__, pInfo->pCreateInfo->flags);
anv_get_buffer_memory_requirements(device,
pInfo->pCreateInfo->flags,
pInfo->pCreateInfo->size,
pInfo->pCreateInfo->usage,
is_sparse,
pMemoryRequirements);
}
VkResult anv_CreateBuffer(
VkDevice _device,
const VkBufferCreateInfo* pCreateInfo,
const VkAllocationCallbacks* pAllocator,
VkBuffer* pBuffer)
{
ANV_FROM_HANDLE(anv_device, device, _device);
struct anv_buffer *buffer;
if ((device->physical->sparse_type == ANV_SPARSE_TYPE_NOT_SUPPORTED) &&
INTEL_DEBUG(DEBUG_SPARSE) &&
pCreateInfo->flags & (VK_BUFFER_CREATE_SPARSE_BINDING_BIT |
VK_BUFFER_CREATE_SPARSE_RESIDENCY_BIT |
VK_BUFFER_CREATE_SPARSE_ALIASED_BIT))
fprintf(stderr, "=== %s %s:%d flags:0x%08x\n", __func__, __FILE__,
__LINE__, pCreateInfo->flags);
/* Don't allow creating buffers bigger than our address space. The real
 * issue here is that we may align up the buffer size and we don't want
 * that to cause roll-over. However, no one has any business allocating
 * a buffer larger than our GTT size.
 */
if (pCreateInfo->size > device->physical->gtt_size)
return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
buffer = vk_buffer_create(&device->vk, pCreateInfo,
pAllocator, sizeof(*buffer));
if (buffer == NULL)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
buffer->address = ANV_NULL_ADDRESS;
if (anv_buffer_is_sparse(buffer)) {
enum anv_bo_alloc_flags alloc_flags = 0;
uint64_t client_address = 0;
if (buffer->vk.create_flags & VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT) {
alloc_flags = ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS;
const VkBufferOpaqueCaptureAddressCreateInfo *opaque_addr_info =
vk_find_struct_const(pCreateInfo->pNext,
BUFFER_OPAQUE_CAPTURE_ADDRESS_CREATE_INFO);
if (opaque_addr_info)
client_address = opaque_addr_info->opaqueCaptureAddress;
}
if (buffer->vk.create_flags & VK_BUFFER_CREATE_DESCRIPTOR_BUFFER_CAPTURE_REPLAY_BIT_EXT) {
alloc_flags = ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS;
const VkOpaqueCaptureDescriptorDataCreateInfoEXT *opaque_info =
vk_find_struct_const(pCreateInfo->pNext,
OPAQUE_CAPTURE_DESCRIPTOR_DATA_CREATE_INFO_EXT);
if (opaque_info)
client_address = *((const uint64_t *)opaque_info->opaqueCaptureDescriptorData);
}
VkResult result = anv_init_sparse_bindings(device, buffer->vk.size,
&buffer->sparse_data,
alloc_flags, client_address,
&buffer->address);
if (result != VK_SUCCESS) {
vk_buffer_destroy(&device->vk, pAllocator, &buffer->vk);
return result;
}
}
ANV_RMV(buffer_create, device, false, buffer);
*pBuffer = anv_buffer_to_handle(buffer);
return VK_SUCCESS;
}
void anv_DestroyBuffer(
VkDevice _device,
VkBuffer _buffer,
const VkAllocationCallbacks* pAllocator)
{
ANV_FROM_HANDLE(anv_device, device, _device);
ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
if (!buffer)
return;
ANV_RMV(buffer_destroy, device, buffer);
if (anv_buffer_is_sparse(buffer)) {
assert(buffer->address.offset == buffer->sparse_data.address);
anv_free_sparse_bindings(device, &buffer->sparse_data);
}
vk_buffer_destroy(&device->vk, pAllocator, &buffer->vk);
}
VkDeviceAddress anv_GetBufferDeviceAddress(
VkDevice device,
const VkBufferDeviceAddressInfo* pInfo)
{
ANV_FROM_HANDLE(anv_buffer, buffer, pInfo->buffer);
assert(!anv_address_is_null(buffer->address));
return anv_address_physical(buffer->address);
}
uint64_t anv_GetBufferOpaqueCaptureAddress(
VkDevice device,
const VkBufferDeviceAddressInfo* pInfo)
{
ANV_FROM_HANDLE(anv_buffer, buffer, pInfo->buffer);
return anv_address_physical(buffer->address);
}
VkResult anv_GetBufferOpaqueCaptureDescriptorDataEXT(
VkDevice device,
const VkBufferCaptureDescriptorDataInfoEXT* pInfo,
void* pData)
{
ANV_FROM_HANDLE(anv_buffer, buffer, pInfo->buffer);
*((uint64_t *)pData) = anv_address_physical(buffer->address);
return VK_SUCCESS;
}
uint64_t anv_GetDeviceMemoryOpaqueCaptureAddress(
VkDevice device,
const VkDeviceMemoryOpaqueCaptureAddressInfo* pInfo)
{
ANV_FROM_HANDLE(anv_device_memory, memory, pInfo->memory);
assert(memory->bo->alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS);
return intel_48b_address(memory->bo->offset);
}
void
anv_fill_buffer_surface_state(struct anv_device *device,
void *surface_state_ptr,
enum isl_format format,
struct isl_swizzle swizzle,
isl_surf_usage_flags_t usage,
struct anv_address address,
uint32_t range, uint32_t stride)
{
isl_buffer_fill_state(&device->isl_dev, surface_state_ptr,
.address = anv_address_physical(address),
.mocs = isl_mocs(&device->isl_dev, usage,
address.bo && anv_bo_is_external(address.bo)),
.size_B = range,
.format = format,
.swizzle = swizzle,
.stride_B = stride);
}
VkResult anv_GetSamplerOpaqueCaptureDescriptorDataEXT(
VkDevice _device,
const VkSamplerCaptureDescriptorDataInfoEXT* pInfo,
void* pData)
{
ANV_FROM_HANDLE(anv_device, device, _device);
ANV_FROM_HANDLE(anv_sampler, sampler, pInfo->sampler);
if (sampler->custom_border_color_db.alloc_size != 0) {
*((uint32_t *)pData) =
anv_state_reserved_array_pool_state_index(
&device->custom_border_colors_db,
sampler->custom_border_color_db);
} else {
*((uint32_t *)pData) = 0;
}
return VK_SUCCESS;
}
void anv_DestroySampler(
VkDevice _device,
VkSampler _sampler,
const VkAllocationCallbacks* pAllocator)
{
ANV_FROM_HANDLE(anv_device, device, _device);
ANV_FROM_HANDLE(anv_sampler, sampler, _sampler);
if (!sampler)
return;
if (sampler->bindless_state.map) {
anv_state_pool_free(&device->dynamic_state_pool,
sampler->bindless_state);
}
if (sampler->custom_border_color.map) {
anv_state_reserved_pool_free(&device->custom_border_colors,
sampler->custom_border_color);
}
if (sampler->custom_border_color_db.map) {
anv_state_reserved_array_pool_free(&device->custom_border_colors_db,
sampler->custom_border_color_db);
}
vk_sampler_destroy(&device->vk, pAllocator, &sampler->vk);
}
static const VkTimeDomainKHR anv_time_domains[] = {
VK_TIME_DOMAIN_DEVICE_KHR,
VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR,
#ifdef CLOCK_MONOTONIC_RAW
VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR,
#endif
};
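/* Report the time domains we can calibrate against, i.e. the entries of
 * anv_time_domains above.
 */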
VkResult anv_GetPhysicalDeviceCalibrateableTimeDomainsKHR(
VkPhysicalDevice physicalDevice,
uint32_t *pTimeDomainCount,
VkTimeDomainKHR *pTimeDomains)
{
int d;
VK_OUTARRAY_MAKE_TYPED(VkTimeDomainKHR, out, pTimeDomains, pTimeDomainCount);
for (d = 0; d < ARRAY_SIZE(anv_time_domains); d++) {
vk_outarray_append_typed(VkTimeDomainKHR, &out, i) {
*i = anv_time_domains[d];
}
}
return vk_outarray_status(&out);
}
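/* CPU clock used for the begin/end deviation window in
 * anv_GetCalibratedTimestampsKHR(): CLOCK_MONOTONIC_RAW when available,
 * CLOCK_MONOTONIC otherwise.
 */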
static inline clockid_t
anv_get_default_cpu_clock_id(void)
{
#ifdef CLOCK_MONOTONIC_RAW
return CLOCK_MONOTONIC_RAW;
#else
return CLOCK_MONOTONIC;
#endif
}
static inline clockid_t
vk_time_domain_to_clockid(VkTimeDomainKHR domain)
{
switch (domain) {
#ifdef CLOCK_MONOTONIC_RAW
case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR:
return CLOCK_MONOTONIC_RAW;
#endif
case VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR:
return CLOCK_MONOTONIC;
default:
unreachable("Missing");
return CLOCK_MONOTONIC;
}
}
static inline bool
is_cpu_time_domain(VkTimeDomainKHR domain)
{
return domain == VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR ||
domain == VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR;
}
static inline bool
is_gpu_time_domain(VkTimeDomainKHR domain)
{
return domain == VK_TIME_DOMAIN_DEVICE_KHR;
}
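/* Sample the requested timestamps. On kernels that expose a correlated
 * CPU/GPU timestamp query (Xe), adjacent CPU+GPU requests are serviced with
 * a single query for better accuracy; otherwise each domain is read
 * separately. The maximum deviation is derived from the CPU time window
 * around the whole operation and the largest clock period involved.
 */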
VkResult anv_GetCalibratedTimestampsKHR(
VkDevice _device,
uint32_t timestampCount,
const VkCalibratedTimestampInfoKHR *pTimestampInfos,
uint64_t *pTimestamps,
uint64_t *pMaxDeviation)
{
ANV_FROM_HANDLE(anv_device, device, _device);
const uint64_t timestamp_frequency = device->info->timestamp_frequency;
const uint64_t device_period = DIV_ROUND_UP(1000000000, timestamp_frequency);
uint32_t d, increment;
uint64_t begin, end;
uint64_t max_clock_period = 0;
const enum intel_kmd_type kmd_type = device->physical->info.kmd_type;
const bool has_correlate_timestamp = kmd_type == INTEL_KMD_TYPE_XE;
clockid_t cpu_clock_id = -1;
begin = end = vk_clock_gettime(anv_get_default_cpu_clock_id());
for (d = 0, increment = 1; d < timestampCount; d += increment) {
const VkTimeDomainKHR current = pTimestampInfos[d].timeDomain;
      /* If we have a request pattern like this:
* - domain0 = VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR or VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR
* - domain1 = VK_TIME_DOMAIN_DEVICE_KHR
* - domain2 = domain0 (optional)
*
* We can combine all of those into a single ioctl for maximum accuracy.
*/
if (has_correlate_timestamp && (d + 1) < timestampCount) {
const VkTimeDomainKHR next = pTimestampInfos[d + 1].timeDomain;
if ((is_cpu_time_domain(current) && is_gpu_time_domain(next)) ||
(is_gpu_time_domain(current) && is_cpu_time_domain(next))) {
/* We'll consume at least 2 elements. */
increment = 2;
if (is_cpu_time_domain(current))
cpu_clock_id = vk_time_domain_to_clockid(current);
else
cpu_clock_id = vk_time_domain_to_clockid(next);
uint64_t cpu_timestamp, gpu_timestamp, cpu_delta_timestamp, cpu_end_timestamp;
if (!intel_gem_read_correlate_cpu_gpu_timestamp(device->fd,
kmd_type,
INTEL_ENGINE_CLASS_RENDER,
0 /* engine_instance */,
cpu_clock_id,
&cpu_timestamp,
&gpu_timestamp,
&cpu_delta_timestamp))
               return vk_device_set_lost(&device->vk, "Failed to read correlated timestamp %m");
cpu_end_timestamp = cpu_timestamp + cpu_delta_timestamp;
if (is_cpu_time_domain(current)) {
pTimestamps[d] = cpu_timestamp;
pTimestamps[d + 1] = gpu_timestamp;
} else {
pTimestamps[d] = gpu_timestamp;
pTimestamps[d + 1] = cpu_end_timestamp;
}
max_clock_period = MAX2(max_clock_period, device_period);
/* If we can consume a third element */
if ((d + 2) < timestampCount &&
is_cpu_time_domain(current) &&
current == pTimestampInfos[d + 2].timeDomain) {
pTimestamps[d + 2] = cpu_end_timestamp;
increment++;
}
/* If we're the first element, we can replace begin */
if (d == 0 && cpu_clock_id == anv_get_default_cpu_clock_id())
begin = cpu_timestamp;
            /* If we're in the same clock domain as begin/end, we can set the end. */
if (cpu_clock_id == anv_get_default_cpu_clock_id())
end = cpu_end_timestamp;
continue;
}
}
      /* Fall back to the regular, per-domain method. */
increment = 1;
switch (current) {
case VK_TIME_DOMAIN_DEVICE_KHR:
if (!intel_gem_read_render_timestamp(device->fd,
device->info->kmd_type,
&pTimestamps[d])) {
return vk_device_set_lost(&device->vk, "Failed to read the "
"TIMESTAMP register: %m");
}
max_clock_period = MAX2(max_clock_period, device_period);
break;
case VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR:
pTimestamps[d] = vk_clock_gettime(CLOCK_MONOTONIC);
max_clock_period = MAX2(max_clock_period, 1);
break;
#ifdef CLOCK_MONOTONIC_RAW
case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR:
pTimestamps[d] = begin;
break;
#endif
default:
pTimestamps[d] = 0;
break;
}
}
   /* If the last timestamp was not read with the correlated-timestamp path,
    * or if it was but the last CPU clock is not the default one, sample the
    * CPU time again.
    */
if (increment == 1 || cpu_clock_id != anv_get_default_cpu_clock_id())
end = vk_clock_gettime(anv_get_default_cpu_clock_id());
*pMaxDeviation = vk_time_max_deviation(begin, end, max_clock_period);
return VK_SUCCESS;
}
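/* The sample location grid size is 1x1 for sample counts the device
 * supports and 0x0 for everything else.
 */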
void anv_GetPhysicalDeviceMultisamplePropertiesEXT(
VkPhysicalDevice physicalDevice,
VkSampleCountFlagBits samples,
VkMultisamplePropertiesEXT* pMultisampleProperties)
{
ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
assert(pMultisampleProperties->sType ==
VK_STRUCTURE_TYPE_MULTISAMPLE_PROPERTIES_EXT);
VkExtent2D grid_size;
if (samples & isl_device_get_sample_counts(&physical_device->isl_dev)) {
grid_size.width = 1;
grid_size.height = 1;
} else {
grid_size.width = 0;
grid_size.height = 0;
}
pMultisampleProperties->maxSampleLocationGridSize = grid_size;
vk_foreach_struct(ext, pMultisampleProperties->pNext)
vk_debug_ignored_stype(ext->sType);
}
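/* Advertise the supported fragment shading rates, from 4x4 down to 1x1.
 * On platforms with coarse-pixel support in the primitive/CB pipeline the
 * allowed sample counts are further restricted (see BSpec 47003 below).
 */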
VkResult anv_GetPhysicalDeviceFragmentShadingRatesKHR(
VkPhysicalDevice physicalDevice,
uint32_t* pFragmentShadingRateCount,
VkPhysicalDeviceFragmentShadingRateKHR* pFragmentShadingRates)
{
ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
VK_OUTARRAY_MAKE_TYPED(VkPhysicalDeviceFragmentShadingRateKHR, out,
pFragmentShadingRates, pFragmentShadingRateCount);
#define append_rate(_samples, _width, _height) \
do { \
vk_outarray_append_typed(VkPhysicalDeviceFragmentShadingRateKHR, &out, __r) { \
__r->sampleCounts = _samples; \
__r->fragmentSize = (VkExtent2D) { \
.width = _width, \
.height = _height, \
}; \
} \
} while (0)
VkSampleCountFlags sample_counts =
isl_device_get_sample_counts(&physical_device->isl_dev);
   /* BSpec 47003: There are a number of restrictions on the sample count
    * based on the coarse pixel size.
    */
static const VkSampleCountFlags cp_size_sample_limits[] = {
[1] = ISL_SAMPLE_COUNT_16_BIT | ISL_SAMPLE_COUNT_8_BIT |
ISL_SAMPLE_COUNT_4_BIT | ISL_SAMPLE_COUNT_2_BIT | ISL_SAMPLE_COUNT_1_BIT,
[2] = ISL_SAMPLE_COUNT_4_BIT | ISL_SAMPLE_COUNT_2_BIT | ISL_SAMPLE_COUNT_1_BIT,
[4] = ISL_SAMPLE_COUNT_4_BIT | ISL_SAMPLE_COUNT_2_BIT | ISL_SAMPLE_COUNT_1_BIT,
[8] = ISL_SAMPLE_COUNT_2_BIT | ISL_SAMPLE_COUNT_1_BIT,
[16] = ISL_SAMPLE_COUNT_1_BIT,
};
for (uint32_t x = 4; x >= 1; x /= 2) {
for (uint32_t y = 4; y >= 1; y /= 2) {
if (physical_device->info.has_coarse_pixel_primitive_and_cb) {
/* BSpec 47003:
* "CPsize 1x4 and 4x1 are not supported"
*/
if ((x == 1 && y == 4) || (x == 4 && y == 1))
continue;
            /* For size {1, 1}, the sample count must be ~0.
             *
             * 4x2 is also a special case.
             */
if (x == 1 && y == 1)
append_rate(~0, x, y);
else if (x == 4 && y == 2)
append_rate(ISL_SAMPLE_COUNT_1_BIT, x, y);
else
append_rate(cp_size_sample_limits[x * y], x, y);
} else {
/* For size {1, 1}, the sample count must be ~0 */
if (x == 1 && y == 1)
append_rate(~0, x, y);
else
append_rate(sample_counts, x, y);
}
}
}
#undef append_rate
return vk_outarray_status(&out);
}
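/* Select the PAT entry to use for a BO based on its allocation flags. */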
const struct intel_device_info_pat_entry *
anv_device_get_pat_entry(struct anv_device *device,
enum anv_bo_alloc_flags alloc_flags)
{
if (alloc_flags & ANV_BO_ALLOC_IMPORTED)
return &device->info->pat.cached_coherent;
   /* PAT indexes have no actual effect on DG2 and DG1: smem will always be
    * snooped by the GPU and lmem will always be WC.
    * This might change on future discrete platforms.
    */
if (anv_physical_device_has_vram(device->physical)) {
if (alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM)
return &device->info->pat.cached_coherent;
return &device->info->pat.writecombining;
}
if ((alloc_flags & (ANV_BO_ALLOC_HOST_CACHED_COHERENT)) == ANV_BO_ALLOC_HOST_CACHED_COHERENT)
return &device->info->pat.cached_coherent;
else if (alloc_flags & (ANV_BO_ALLOC_EXTERNAL | ANV_BO_ALLOC_SCANOUT))
return &device->info->pat.scanout;
else if (alloc_flags & ANV_BO_ALLOC_HOST_CACHED)
return &device->info->pat.writeback_incoherent;
else
return &device->info->pat.writecombining;
}
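/* Translate the component types used by intel_device_info's cooperative
 * matrix configurations into their Vulkan equivalents.
 */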
static VkComponentTypeKHR
convert_component_type(enum intel_cooperative_matrix_component_type t)
{
switch (t) {
case INTEL_CMAT_FLOAT16: return VK_COMPONENT_TYPE_FLOAT16_KHR;
case INTEL_CMAT_FLOAT32: return VK_COMPONENT_TYPE_FLOAT32_KHR;
case INTEL_CMAT_SINT32: return VK_COMPONENT_TYPE_SINT32_KHR;
case INTEL_CMAT_SINT8: return VK_COMPONENT_TYPE_SINT8_KHR;
case INTEL_CMAT_UINT32: return VK_COMPONENT_TYPE_UINT32_KHR;
case INTEL_CMAT_UINT8: return VK_COMPONENT_TYPE_UINT8_KHR;
}
unreachable("invalid cooperative matrix component type in configuration");
}
static VkScopeKHR
convert_scope(enum intel_cmat_scope scope)
{
switch (scope) {
case INTEL_CMAT_SCOPE_SUBGROUP: return VK_SCOPE_SUBGROUP_KHR;
default:
unreachable("invalid cooperative matrix scope in configuration");
}
}
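/* Expose the cooperative matrix configurations from intel_device_info.
 * Integer configurations are additionally advertised with saturating
 * accumulation when the DPAS would be lowered (see the VUID comment below).
 */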
VkResult anv_GetPhysicalDeviceCooperativeMatrixPropertiesKHR(
VkPhysicalDevice physicalDevice,
uint32_t* pPropertyCount,
VkCooperativeMatrixPropertiesKHR* pProperties)
{
ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
const struct intel_device_info *devinfo = &pdevice->info;
assert(anv_has_cooperative_matrix(pdevice));
VK_OUTARRAY_MAKE_TYPED(VkCooperativeMatrixPropertiesKHR, out, pProperties, pPropertyCount);
for (int i = 0; i < ARRAY_SIZE(devinfo->cooperative_matrix_configurations); i++) {
const struct intel_cooperative_matrix_configuration *cfg =
&devinfo->cooperative_matrix_configurations[i];
if (cfg->scope == INTEL_CMAT_SCOPE_NONE)
break;
vk_outarray_append_typed(VkCooperativeMatrixPropertiesKHR, &out, prop) {
prop->sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR;
prop->MSize = cfg->m;
prop->NSize = cfg->n;
prop->KSize = cfg->k;
prop->AType = convert_component_type(cfg->a);
prop->BType = convert_component_type(cfg->b);
prop->CType = convert_component_type(cfg->c);
prop->ResultType = convert_component_type(cfg->result);
prop->saturatingAccumulation = VK_FALSE;
prop->scope = convert_scope(cfg->scope);
}
/* VUID-RuntimeSpirv-saturatingAccumulation-08983 says:
*
* For OpCooperativeMatrixMulAddKHR, the SaturatingAccumulation
* cooperative matrix operand must be present if and only if
* VkCooperativeMatrixPropertiesKHR::saturatingAccumulation is
* VK_TRUE.
*
* As a result, we have to advertise integer configs both with and
* without this flag set.
*
* The DPAS instruction does not support the .sat modifier, so only
* advertise the configurations when the DPAS would be lowered.
*
* FINISHME: It should be possible to do better than full lowering on
* platforms that support DPAS. Emit a DPAS with a NULL accumulator
* argument, then perform the correct sequence of saturating add
* instructions.
*/
if (cfg->a != INTEL_CMAT_FLOAT16 &&
(devinfo->verx10 < 125 || debug_get_bool_option("INTEL_LOWER_DPAS", false))) {
vk_outarray_append_typed(VkCooperativeMatrixPropertiesKHR, &out, prop) {
prop->sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR;
prop->MSize = cfg->m;
prop->NSize = cfg->n;
prop->KSize = cfg->k;
prop->AType = convert_component_type(cfg->a);
prop->BType = convert_component_type(cfg->b);
prop->CType = convert_component_type(cfg->c);
prop->ResultType = convert_component_type(cfg->result);
prop->saturatingAccumulation = VK_TRUE;
prop->scope = convert_scope(cfg->scope);
}
}
}
return vk_outarray_status(&out);
}