diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c index 9ea47b09b33..945ac7686cd 100644 --- a/src/intel/vulkan/anv_cmd_buffer.c +++ b/src/intel/vulkan/anv_cmd_buffer.c @@ -980,11 +980,6 @@ anv_cmd_buffer_bind_descriptor_set(struct anv_cmd_buffer *cmd_buffer, pipe_state->push_constants_data_dirty = true; } -#define ANV_GRAPHICS_STAGE_BITS \ - (VK_SHADER_STAGE_ALL_GRAPHICS | \ - VK_SHADER_STAGE_MESH_BIT_EXT | \ - VK_SHADER_STAGE_TASK_BIT_EXT) - void anv_CmdBindDescriptorSets2KHR( VkCommandBuffer commandBuffer, const VkBindDescriptorSetsInfoKHR* pInfo) diff --git a/src/intel/vulkan/anv_physical_device.c b/src/intel/vulkan/anv_physical_device.c index 2c3764436f3..650217f864a 100644 --- a/src/intel/vulkan/anv_physical_device.c +++ b/src/intel/vulkan/anv_physical_device.c @@ -2013,6 +2013,11 @@ get_properties(const struct anv_physical_device *pdevice, /* We support 4k/64k tiling alignments on most platforms */ props->supportedImageAlignmentMask = (1 << 12) | (1 << 16); } + + /* For the runtime common code (even though we don't support + * VK_EXT_shader_object) + */ + memcpy(props->shaderBinaryUUID, pdevice->shader_binary_uuid, VK_UUID_SIZE); } /* This function restricts the maximum size of system memory heap. The @@ -2761,6 +2766,8 @@ anv_physical_device_try_create(struct vk_instance *vk_instance, anv_physical_device_init_perf(device, fd); + anv_shader_init_uuid(device); + /* Gather major/minor before WSI. */ struct stat st; diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index ff04ec2e636..dbfd0c1bc6c 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -97,6 +97,7 @@ #include "vk_pipeline_layout.h" #include "vk_physical_device.h" #include "vk_sampler.h" +#include "vk_shader.h" #include "vk_shader_module.h" #include "vk_sync.h" #include "vk_texcompress_astc.h" @@ -151,6 +152,10 @@ struct intel_perf_query_result; #define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC_FAST #endif +#define ANV_GRAPHICS_STAGE_BITS (VK_SHADER_STAGE_ALL_GRAPHICS | \ + VK_SHADER_STAGE_MESH_BIT_EXT | \ + VK_SHADER_STAGE_TASK_BIT_EXT) + #define ANV_RT_STAGE_BITS (VK_SHADER_STAGE_RAYGEN_BIT_KHR | \ VK_SHADER_STAGE_ANY_HIT_BIT_KHR | \ VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR | \ @@ -158,6 +163,11 @@ struct intel_perf_query_result; VK_SHADER_STAGE_INTERSECTION_BIT_KHR | \ VK_SHADER_STAGE_CALLABLE_BIT_KHR) +#define ANV_VK_STAGE_MASK (ANV_GRAPHICS_STAGE_BITS | \ + ANV_RT_STAGE_BITS | \ + VK_SHADER_STAGE_COMPUTE_BIT) + + #define NSEC_PER_SEC 1000000000ull #define BINDING_TABLE_POOL_BLOCK_SIZE (65536) @@ -245,6 +255,7 @@ get_max_vbs(const struct intel_device_info *devinfo) { #define ANV_BINDLESS_SURFACE_BASE_ADDR_REG 0x2668 /* MI_ALU_REG13 */ #define ANV_GRAPHICS_SHADER_STAGE_COUNT (MESA_SHADER_MESH + 1) +#define ANV_RT_SHADER_STAGE_COUNT (MESA_SHADER_CALLABLE - MESA_SHADER_RAYGEN + 1) /* Defines where various values are defined in the inline parameter register.
*/ @@ -1022,6 +1033,188 @@ VkResult anv_bo_cache_init(struct anv_bo_cache *cache, struct anv_device *device); void anv_bo_cache_finish(struct anv_bo_cache *cache); +/* Relocations */ +struct anv_reloc_list { + bool uses_relocs; + uint32_t dep_words; + BITSET_WORD * deps; + const VkAllocationCallbacks *alloc; +}; + +VkResult anv_reloc_list_init(struct anv_reloc_list *list, + const VkAllocationCallbacks *alloc, + bool uses_relocs); +void anv_reloc_list_finish(struct anv_reloc_list *list); + +VkResult +anv_reloc_list_add_bo_impl(struct anv_reloc_list *list, struct anv_bo *target_bo); + +static inline VkResult +anv_reloc_list_add_bo(struct anv_reloc_list *list, struct anv_bo *target_bo) +{ + return list->uses_relocs ? anv_reloc_list_add_bo_impl(list, target_bo) : VK_SUCCESS; +} + +VkResult anv_reloc_list_append(struct anv_reloc_list *list, + struct anv_reloc_list *other); + +/* Shaders */ + +#define ANV_DESCRIPTOR_SET_PER_PRIM_PADDING (UINT8_MAX - 5) +#define ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER (UINT8_MAX - 4) +#define ANV_DESCRIPTOR_SET_NULL (UINT8_MAX - 3) +#define ANV_DESCRIPTOR_SET_PUSH_CONSTANTS (UINT8_MAX - 2) +#define ANV_DESCRIPTOR_SET_DESCRIPTORS (UINT8_MAX - 1) +#define ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS UINT8_MAX + +struct anv_pipeline_binding { + /** Index in the descriptor set + * + * This is a flattened index; the descriptor set layout is already taken + * into account. + */ + uint32_t index; + + /** Binding in the descriptor set. Not valid for any of the + * ANV_DESCRIPTOR_SET_* + */ + uint32_t binding; + + /** Offset in the descriptor buffer + * + * Relative to anv_descriptor_set::desc_addr. This is useful for + * ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT, to generate the binding + * table entry. + */ + uint32_t set_offset; + + /** The descriptor set this surface corresponds to. + * + * The special ANV_DESCRIPTOR_SET_* values above indicate that this + * binding is not a normal descriptor set but something else. + */ + uint8_t set; + + union { + /** Plane in the binding index for images */ + uint8_t plane; + + /** Input attachment index (relative to the subpass) */ + uint8_t input_attachment_index; + + /** Dynamic offset index + * + * For dynamic UBOs and SSBOs, relative to set. + */ + uint8_t dynamic_offset_index; + }; +}; + +struct anv_embedded_sampler_key { + /** No need to track binding elements for embedded samplers as: + * + * VUID-VkDescriptorSetLayoutBinding-flags-08006: + * + * "If VkDescriptorSetLayoutCreateInfo::flags contains + * VK_DESCRIPTOR_SET_LAYOUT_CREATE_EMBEDDED_IMMUTABLE_SAMPLERS_BIT_EXT, + * descriptorCount must: be less than or equal to 1" + * + * The following struct can be safely hashed as it doesn't include any + * address/offset. + */ + uint32_t sampler[4]; + uint32_t color[4]; +}; + +struct anv_pipeline_embedded_sampler_binding { + /** The descriptor set this sampler belongs to */ + uint8_t set; + + /** The binding in the set this sampler belongs to */ + uint32_t binding; + + /** The data configuring the sampler */ + struct anv_embedded_sampler_key key; +}; + +struct anv_push_range { + /** Index in the descriptor set */ + uint32_t index; + + /** Descriptor set index */ + uint8_t set; + + /** Dynamic offset index (for dynamic UBOs), relative to set.
*/ + uint8_t dynamic_offset_index; + + /** Start offset in units of 32B */ + uint8_t start; + + /** Range in units of 32B */ + uint8_t length; +}; + +struct anv_pipeline_bind_map { + unsigned char surface_sha1[20]; + unsigned char sampler_sha1[20]; + unsigned char push_sha1[20]; + + /* enum anv_descriptor_set_layout_type */ + uint32_t layout_type; + + uint32_t surface_count; + uint32_t sampler_count; + uint32_t embedded_sampler_count; + + struct anv_pipeline_binding * surface_to_descriptor; + struct anv_pipeline_binding * sampler_to_descriptor; + struct anv_pipeline_embedded_sampler_binding* embedded_sampler_to_binding; + BITSET_DECLARE(input_attachments, MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS + 1); + + struct anv_push_range push_ranges[4]; +}; + +struct anv_push_descriptor_info { + /* A bitfield of descriptors used. */ + uint32_t used_descriptors; + + /* A bitfield of UBO bindings fully promoted to push constants. */ + uint32_t fully_promoted_ubo_descriptors; + + /* A bitfield with one bit set indicating the push descriptor set used. */ + uint8_t push_set_buffer; +}; + +struct anv_shader { + struct vk_shader vk; + + struct anv_state kernel; + + const struct brw_stage_prog_data *prog_data; + uint32_t prog_data_size; + + struct brw_compile_stats stats[3]; + uint32_t num_stats; + + struct nir_xfb_info *xfb_info; + + struct anv_push_descriptor_info push_desc_info; + + struct anv_pipeline_bind_map bind_map; + + uint32_t instance_multiplier; + + /* Not saved in the pipeline cache. + * + * Array of pointers of length bind_map.embedded_sampler_count + */ + struct anv_embedded_sampler **embedded_samplers; +}; + +extern struct vk_device_shader_ops anv_device_shader_ops; + +/* Physical device */ + struct anv_queue_family { /* Standard bits passed on to the client */ VkQueueFlags queueFlags; @@ -1274,6 +1467,7 @@ struct anv_physical_device { struct anv_memregion vram_non_mappable; struct anv_memregion sys; uint8_t driver_build_sha1[20]; + uint8_t shader_binary_uuid[VK_UUID_SIZE]; uint8_t pipeline_cache_uuid[VK_UUID_SIZE]; uint8_t driver_uuid[VK_UUID_SIZE]; uint8_t device_uuid[VK_UUID_SIZE]; @@ -1458,7 +1652,6 @@ struct anv_queue { }; struct nir_xfb_info; -struct anv_pipeline_bind_map; struct anv_pipeline_sets_layout; struct anv_push_descriptor_info; @@ -2598,30 +2791,6 @@ anv_bo_is_small_heap(enum anv_bo_alloc_flags alloc_flags) ANV_BO_ALLOC_32BIT_ADDRESS); } -struct anv_reloc_list { - bool uses_relocs; - uint32_t dep_words; - BITSET_WORD * deps; - const VkAllocationCallbacks *alloc; -}; - -VkResult anv_reloc_list_init(struct anv_reloc_list *list, - const VkAllocationCallbacks *alloc, - bool uses_relocs); -void anv_reloc_list_finish(struct anv_reloc_list *list); - -VkResult -anv_reloc_list_add_bo_impl(struct anv_reloc_list *list, struct anv_bo *target_bo); - -static inline VkResult -anv_reloc_list_add_bo(struct anv_reloc_list *list, struct anv_bo *target_bo) -{ - return list->uses_relocs ?
anv_reloc_list_add_bo_impl(list, target_bo) : VK_SUCCESS; -} - -VkResult anv_reloc_list_append(struct anv_reloc_list *list, - struct anv_reloc_list *other); - struct anv_batch_bo { /* Link in the anv_cmd_buffer.owned_batch_bos list */ struct list_head link; @@ -3057,22 +3226,6 @@ enum anv_descriptor_data { ANV_DESCRIPTOR_SURFACE_SAMPLER = BITFIELD_BIT(9), }; -struct anv_embedded_sampler_key { - /** No need to track binding elements for embedded samplers as : - * - * VUID-VkDescriptorSetLayoutBinding-flags-08006: - * - * "If VkDescriptorSetLayoutCreateInfo:flags contains - * VK_DESCRIPTOR_SET_LAYOUT_CREATE_EMBEDDED_IMMUTABLE_SAMPLERS_BIT_EXT, - * descriptorCount must: less than or equal to 1" - * - * The following struct can be safely hash as it doesn't include in - * address/offset. - */ - uint32_t sampler[4]; - uint32_t color[4]; -}; - struct anv_descriptor_set_layout_sampler { /* Immutable sampler used to populate descriptor sets on allocation */ struct anv_sampler *immutable_sampler; @@ -3450,84 +3603,6 @@ anv_descriptor_set_write_template(struct anv_device *device, const struct vk_descriptor_update_template *template, const void *data); -#define ANV_DESCRIPTOR_SET_PER_PRIM_PADDING (UINT8_MAX - 5) -#define ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER (UINT8_MAX - 4) -#define ANV_DESCRIPTOR_SET_NULL (UINT8_MAX - 3) -#define ANV_DESCRIPTOR_SET_PUSH_CONSTANTS (UINT8_MAX - 2) -#define ANV_DESCRIPTOR_SET_DESCRIPTORS (UINT8_MAX - 1) -#define ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS UINT8_MAX - -struct anv_pipeline_binding { - /** Index in the descriptor set - * - * This is a flattened index; the descriptor set layout is already taken - * into account. - */ - uint32_t index; - - /** Binding in the descriptor set. Not valid for any of the - * ANV_DESCRIPTOR_SET_* - */ - uint32_t binding; - - /** Offset in the descriptor buffer - * - * Relative to anv_descriptor_set::desc_addr. This is useful for - * ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT, to generate the binding - * table entry. - */ - uint32_t set_offset; - - /** The descriptor set this surface corresponds to. - * - * The special ANV_DESCRIPTOR_SET_* values above indicates that this - * binding is not a normal descriptor set but something else. - */ - uint8_t set; - - union { - /** Plane in the binding index for images */ - uint8_t plane; - - /** Input attachment index (relative to the subpass) */ - uint8_t input_attachment_index; - - /** Dynamic offset index - * - * For dynamic UBOs and SSBOs, relative to set. - */ - uint8_t dynamic_offset_index; - }; -}; - -struct anv_pipeline_embedded_sampler_binding { - /** The descriptor set this sampler belongs to */ - uint8_t set; - - /** The binding in the set this sampler belongs to */ - uint32_t binding; - - /** The data configuring the sampler */ - struct anv_embedded_sampler_key key; -}; - -struct anv_push_range { - /** Index in the descriptor set */ - uint32_t index; - - /** Descriptor set index */ - uint8_t set; - - /** Dynamic offset index (for dynamic UBOs), relative to set. 
*/ - uint8_t dynamic_offset_index; - - /** Start offset in units of 32B */ - uint8_t start; - - /** Range in units of 32B */ - uint8_t length; -}; - struct anv_pipeline_sets_layout { struct anv_device *device; @@ -4909,36 +4984,11 @@ struct anv_event { stage = __builtin_ffs(__tmp) - 1, __tmp; \ __tmp &= ~(1 << (stage))) -struct anv_pipeline_bind_map { - unsigned char surface_sha1[20]; - unsigned char sampler_sha1[20]; - unsigned char push_sha1[20]; - - /* enum anv_descriptor_set_layout_type */ - uint32_t layout_type; - - uint32_t surface_count; - uint32_t sampler_count; - uint32_t embedded_sampler_count; - - struct anv_pipeline_binding * surface_to_descriptor; - struct anv_pipeline_binding * sampler_to_descriptor; - struct anv_pipeline_embedded_sampler_binding* embedded_sampler_to_binding; - BITSET_DECLARE(input_attachments, MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS + 1); - - struct anv_push_range push_ranges[4]; -}; - -struct anv_push_descriptor_info { - /* A bitfield of descriptors used. */ - uint32_t used_descriptors; - - /* A bitfield of UBOs bindings fully promoted to push constants. */ - uint32_t fully_promoted_ubo_descriptors; - - /* A bitfield with one bit set indicating the push descriptor set used. */ - uint8_t push_set_buffer; -}; +#define anv_foreach_vk_stage(stage, stage_bits) \ + for (VkShaderStageFlags stage, \ + __tmp = ((stage_bits) & ANV_VK_STAGE_MASK); \ + stage = __tmp ? BITFIELD_BIT(__builtin_ffs(__tmp) - 1) : 0, __tmp; \ + __tmp &= ~(stage)) struct anv_shader_upload_params { mesa_shader_stage stage; @@ -5046,6 +5096,8 @@ enum anv_pipeline_type { ANV_PIPELINE_RAY_TRACING, }; +void anv_shader_init_uuid(struct anv_physical_device *device); + struct anv_pipeline { struct vk_pipeline vk; diff --git a/src/intel/vulkan/anv_shader.c b/src/intel/vulkan/anv_shader.c new file mode 100644 index 00000000000..3b3a670a822 --- /dev/null +++ b/src/intel/vulkan/anv_shader.c @@ -0,0 +1,427 @@ +/* Copyright © 2024 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "anv_private.h" +#include "anv_shader.h" + +#include "nir/nir_serialize.h" + +static void +anv_shader_destroy(struct vk_device *vk_device, + struct vk_shader *vk_shader, + const VkAllocationCallbacks *pAllocator) +{ + struct anv_device *device = + container_of(vk_device, struct anv_device, vk); + struct anv_shader *shader = + container_of(vk_shader, struct anv_shader, vk); + + for (uint32_t i = 0; i < shader->bind_map.embedded_sampler_count; i++) + anv_embedded_sampler_unref(device, shader->embedded_samplers[i]); + + anv_state_pool_free(&device->instruction_state_pool, shader->kernel); + vk_shader_free(vk_device, pAllocator, vk_shader); +} + +VkResult +anv_shader_deserialize(struct vk_device *vk_device, + struct blob_reader *blob, + uint32_t binary_version, + const VkAllocationCallbacks* pAllocator, + struct vk_shader **shader_out) +{ + struct anv_device *device = container_of(vk_device, struct anv_device, vk); + + struct anv_shader_data data = {}; + + mesa_shader_stage stage = blob_read_uint32(blob); + + uint32_t code_len = blob_read_uint32(blob); + data.code = (void *)blob_read_bytes(blob, code_len); + + blob_copy_bytes(blob, &data.prog_data, brw_prog_data_size(stage)); + + data.prog_data.base.relocs = + blob_read_bytes(blob, data.prog_data.base.num_relocs * + sizeof(data.prog_data.base.relocs[0])); + + data.num_stats = blob_read_uint32(blob); + blob_copy_bytes(blob, data.stats, data.num_stats * sizeof(data.stats[0])); + + uint32_t xfb_size = blob_read_uint32(blob); + if (xfb_size) + data.xfb_info = blob_read_bytes(blob,
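+ /* borrowed pointer into the blob; anv_shader_create() makes its own copy */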
xfb_size); + + data.instance_multiplier = blob_read_uint32(blob); + + data.push_desc_info.used_descriptors = blob_read_uint32(blob); + data.push_desc_info.fully_promoted_ubo_descriptors = blob_read_uint32(blob); + data.push_desc_info.push_set_buffer = blob_read_uint8(blob); + + blob_copy_bytes(blob, data.bind_map.surface_sha1, sizeof(data.bind_map.surface_sha1)); + blob_copy_bytes(blob, data.bind_map.sampler_sha1, sizeof(data.bind_map.sampler_sha1)); + blob_copy_bytes(blob, data.bind_map.push_sha1, sizeof(data.bind_map.push_sha1)); + data.bind_map.layout_type = blob_read_uint32(blob); + data.bind_map.surface_count = blob_read_uint32(blob); + data.bind_map.sampler_count = blob_read_uint32(blob); + data.bind_map.embedded_sampler_count = blob_read_uint32(blob); + data.bind_map.surface_to_descriptor = (void *) + blob_read_bytes(blob, data.bind_map.surface_count * + sizeof(*data.bind_map.surface_to_descriptor)); + data.bind_map.sampler_to_descriptor = (void *) + blob_read_bytes(blob, data.bind_map.sampler_count * + sizeof(*data.bind_map.sampler_to_descriptor)); + data.bind_map.embedded_sampler_to_binding = (void *) + blob_read_bytes(blob, data.bind_map.embedded_sampler_count * + sizeof(*data.bind_map.embedded_sampler_to_binding)); + blob_copy_bytes(blob, data.bind_map.input_attachments, + sizeof(data.bind_map.input_attachments)); + blob_copy_bytes(blob, data.bind_map.push_ranges, sizeof(data.bind_map.push_ranges)); + + if (blob->overrun) + return vk_error(device, VK_ERROR_UNKNOWN); + + VkResult result = + anv_shader_create(device, stage, &data, pAllocator, shader_out); + + return result; +} + +static bool +anv_shader_serialize(struct vk_device *device, + const struct vk_shader *vk_shader, + struct blob *blob) +{ + struct anv_shader *shader = + container_of(vk_shader, struct anv_shader, vk); + + blob_write_uint32(blob, vk_shader->stage); + + blob_write_uint32(blob, shader->prog_data->program_size); + blob_write_bytes(blob, shader->kernel.map, + shader->prog_data->program_size); + + union brw_any_prog_data prog_data; + memcpy(&prog_data, shader->prog_data, brw_prog_data_size(vk_shader->stage)); + prog_data.base.relocs = NULL; + prog_data.base.param = NULL; + + blob_write_bytes(blob, &prog_data, brw_prog_data_size(vk_shader->stage)); + + blob_write_bytes(blob, shader->prog_data->relocs, + shader->prog_data->num_relocs * + sizeof(shader->prog_data->relocs[0])); + + blob_write_uint32(blob, shader->num_stats); + blob_write_bytes(blob, shader->stats, + shader->num_stats * sizeof(shader->stats[0])); + + if (shader->xfb_info) { + uint32_t xfb_info_size = + nir_xfb_info_size(shader->xfb_info->output_count); + blob_write_uint32(blob, xfb_info_size); + blob_write_bytes(blob, shader->xfb_info, xfb_info_size); + } else { + blob_write_uint32(blob, 0); + } + + blob_write_uint32(blob, shader->instance_multiplier); + + blob_write_uint32(blob, shader->push_desc_info.used_descriptors); + blob_write_uint32(blob, shader->push_desc_info.fully_promoted_ubo_descriptors); + blob_write_uint8(blob, shader->push_desc_info.push_set_buffer); + + blob_write_bytes(blob, shader->bind_map.surface_sha1, + sizeof(shader->bind_map.surface_sha1)); + blob_write_bytes(blob, shader->bind_map.sampler_sha1, + sizeof(shader->bind_map.sampler_sha1)); + blob_write_bytes(blob, shader->bind_map.push_sha1, + sizeof(shader->bind_map.push_sha1)); + blob_write_uint32(blob, shader->bind_map.layout_type); + blob_write_uint32(blob, shader->bind_map.surface_count); + blob_write_uint32(blob, shader->bind_map.sampler_count); + blob_write_uint32(blob, 
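+ /* counts are written before their arrays so deserialize can size the reads that follow */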
shader->bind_map.embedded_sampler_count); + blob_write_bytes(blob, shader->bind_map.surface_to_descriptor, + shader->bind_map.surface_count * + sizeof(*shader->bind_map.surface_to_descriptor)); + blob_write_bytes(blob, shader->bind_map.sampler_to_descriptor, + shader->bind_map.sampler_count * + sizeof(*shader->bind_map.sampler_to_descriptor)); + blob_write_bytes(blob, shader->bind_map.embedded_sampler_to_binding, + shader->bind_map.embedded_sampler_count * + sizeof(*shader->bind_map.embedded_sampler_to_binding)); + blob_write_bytes(blob, shader->bind_map.input_attachments, + sizeof(shader->bind_map.input_attachments)); + blob_write_bytes(blob, shader->bind_map.push_ranges, + sizeof(shader->bind_map.push_ranges)); + + return !blob->out_of_memory; +} + +static VkResult +anv_shader_get_executable_properties(struct vk_device *device, + const struct vk_shader *shader, + uint32_t *executable_count, + VkPipelineExecutablePropertiesKHR *properties) +{ + return VK_SUCCESS; +} + +static VkResult +anv_shader_get_executable_statistics(struct vk_device *device, + const struct vk_shader *shader, + uint32_t executable_index, + uint32_t *statistic_count, + VkPipelineExecutableStatisticKHR *statistics) +{ + return VK_SUCCESS; +} + +static VkResult +anv_shader_get_executable_internal_representations( + struct vk_device *device, + const struct vk_shader *shader, + uint32_t executable_index, + uint32_t *internal_representation_count, + VkPipelineExecutableInternalRepresentationKHR *internal_representations) +{ + return VK_SUCCESS; +} + +static struct vk_shader_ops anv_shader_ops = { + .destroy = anv_shader_destroy, + .serialize = anv_shader_serialize, + .get_executable_properties = anv_shader_get_executable_properties, + .get_executable_statistics = anv_shader_get_executable_statistics, + .get_executable_internal_representations = + anv_shader_get_executable_internal_representations, +}; + +static VkResult +anv_shader_reloc(struct anv_device *device, + void *code, + struct anv_shader *shader, + const VkAllocationCallbacks *pAllocator) +{ + uint64_t shader_data_addr = + device->physical->va.instruction_state_pool.addr + + shader->kernel.offset + + shader->prog_data->const_data_offset; + + const uint32_t max_relocs = + BRW_SHADER_RELOC_EMBEDDED_SAMPLER_HANDLE + + shader->bind_map.embedded_sampler_count; + uint32_t rv_count = 0; + struct brw_shader_reloc_value *reloc_values = + vk_zalloc2(&device->vk.alloc, pAllocator, + sizeof(struct brw_shader_reloc_value) * max_relocs, 8, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + if (reloc_values == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + assert((device->physical->va.dynamic_visible_pool.addr & 0xffffffff) == 0); + reloc_values[rv_count++] = (struct brw_shader_reloc_value) { + .id = BRW_SHADER_RELOC_INSTRUCTION_BASE_ADDR_HIGH, + .value = device->physical->va.instruction_state_pool.addr >> 32, + }; + reloc_values[rv_count++] = (struct brw_shader_reloc_value) { + .id = BRW_SHADER_RELOC_DESCRIPTORS_BUFFER_ADDR_HIGH, + .value = device->physical->va.dynamic_visible_pool.addr >> 32, + }; + assert((device->physical->va.indirect_descriptor_pool.addr & 0xffffffff) == 0); + assert((device->physical->va.internal_surface_state_pool.addr & 0xffffffff) == 0); + reloc_values[rv_count++] = (struct brw_shader_reloc_value) { + .id = BRW_SHADER_RELOC_DESCRIPTORS_ADDR_HIGH, + .value = device->physical->indirect_descriptors ? 
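+ /* indirect descriptors have a dedicated pool; direct mode uses internal surface states */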
+ (device->physical->va.indirect_descriptor_pool.addr >> 32) : + (device->physical->va.internal_surface_state_pool.addr >> 32), + }; + assert((device->physical->va.instruction_state_pool.addr & 0xffffffff) == 0); + reloc_values[rv_count++] = (struct brw_shader_reloc_value) { + .id = BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW, + .value = shader_data_addr, + }; + assert((device->physical->va.instruction_state_pool.addr & 0xffffffff) == 0); + assert(shader_data_addr >> 32 == device->physical->va.instruction_state_pool.addr >> 32); + reloc_values[rv_count++] = (struct brw_shader_reloc_value) { + .id = BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH, + .value = device->physical->va.instruction_state_pool.addr >> 32, + }; + reloc_values[rv_count++] = (struct brw_shader_reloc_value) { + .id = BRW_SHADER_RELOC_SHADER_START_OFFSET, + .value = shader->kernel.offset, + }; + if (brw_shader_stage_is_bindless(shader->vk.stage)) { + const struct brw_bs_prog_data *bs_prog_data = + brw_bs_prog_data_const(shader->prog_data); + uint64_t resume_sbt_addr = + device->physical->va.instruction_state_pool.addr + + shader->kernel.offset + + bs_prog_data->resume_sbt_offset; + reloc_values[rv_count++] = (struct brw_shader_reloc_value) { + .id = BRW_SHADER_RELOC_RESUME_SBT_ADDR_LOW, + .value = resume_sbt_addr, + }; + reloc_values[rv_count++] = (struct brw_shader_reloc_value) { + .id = BRW_SHADER_RELOC_RESUME_SBT_ADDR_HIGH, + .value = resume_sbt_addr >> 32, + }; + } + + if (INTEL_DEBUG(DEBUG_SHADER_PRINT)) { + struct anv_bo *bo = device->printf.bo; + assert(bo != NULL); + + reloc_values[rv_count++] = (struct brw_shader_reloc_value) { + .id = BRW_SHADER_RELOC_PRINTF_BUFFER_ADDR_LOW, + .value = bo->offset & 0xffffffff, + }; + reloc_values[rv_count++] = (struct brw_shader_reloc_value) { + .id = BRW_SHADER_RELOC_PRINTF_BUFFER_ADDR_HIGH, + .value = bo->offset >> 32, + }; + reloc_values[rv_count++] = (struct brw_shader_reloc_value) { + .id = BRW_SHADER_RELOC_PRINTF_BUFFER_SIZE, + .value = anv_printf_buffer_size(), + }; + } + + for (uint32_t i = 0; i < shader->bind_map.embedded_sampler_count; i++) { + reloc_values[rv_count++] = (struct brw_shader_reloc_value) { + .id = BRW_SHADER_RELOC_EMBEDDED_SAMPLER_HANDLE + i, + .value = shader->embedded_samplers[i]->sampler_state.offset, + }; + } + + assert(rv_count <= max_relocs); + + brw_write_shader_relocs(&device->physical->compiler->isa, + code, shader->prog_data, + reloc_values, rv_count); + + vk_free2(&device->vk.alloc, pAllocator, reloc_values); + + return VK_SUCCESS; +} + +VkResult +anv_shader_create(struct anv_device *device, + mesa_shader_stage stage, + struct anv_shader_data *shader_data, + const VkAllocationCallbacks *pAllocator, + struct vk_shader **shader_out) +{ + /* We never need this at runtime */ + shader_data->prog_data.base.param = NULL; + + VK_MULTIALLOC(ma); + VK_MULTIALLOC_DECL(&ma, struct anv_shader, shader, 1); + VK_MULTIALLOC_DECL_SIZE(&ma, void, obj_key_data, brw_prog_key_size(stage)); + VK_MULTIALLOC_DECL_SIZE(&ma, struct brw_stage_prog_data, prog_data, + brw_prog_data_size(stage)); + VK_MULTIALLOC_DECL(&ma, struct brw_shader_reloc, prog_data_relocs, + shader_data->prog_data.base.num_relocs); + + VK_MULTIALLOC_DECL_SIZE(&ma, nir_xfb_info, xfb_info, + shader_data->xfb_info == NULL ? 
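+ /* reserve trailing space for a copy of the XFB info only when the shader has one */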
0 : + nir_xfb_info_size(shader_data->xfb_info->output_count)); + + VK_MULTIALLOC_DECL(&ma, struct anv_pipeline_binding, surface_to_descriptor, + shader_data->bind_map.surface_count); + VK_MULTIALLOC_DECL(&ma, struct anv_pipeline_binding, sampler_to_descriptor, + shader_data->bind_map.sampler_count); + VK_MULTIALLOC_DECL(&ma, struct anv_pipeline_embedded_sampler_binding, + embedded_sampler_to_binding, + shader_data->bind_map.embedded_sampler_count); + VK_MULTIALLOC_DECL(&ma, struct anv_embedded_sampler *, embedded_samplers, + shader_data->bind_map.embedded_sampler_count); + + if (!vk_shader_multizalloc(&device->vk, &ma, &anv_shader_ops, + stage, pAllocator)) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + VkResult result; + if (shader_data->bind_map.embedded_sampler_count > 0) { + shader->embedded_samplers = embedded_samplers; + result = anv_device_get_embedded_samplers( + device, embedded_samplers, &shader_data->bind_map); + if (result != VK_SUCCESS) + goto error_shader; + } + + shader->kernel = + anv_state_pool_alloc(&device->instruction_state_pool, + shader_data->prog_data.base.program_size, 64); + if (shader->kernel.alloc_size == 0) { + result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); + goto error_embedded_samplers; + } + + memcpy(prog_data, &shader_data->prog_data, brw_prog_data_size(stage)); + + typed_memcpy(prog_data_relocs, + shader_data->prog_data.base.relocs, + shader_data->prog_data.base.num_relocs); + prog_data->relocs = prog_data_relocs; + + shader->prog_data = prog_data; + + shader->num_stats = shader_data->num_stats; + typed_memcpy(shader->stats, shader_data->stats, shader_data->num_stats); + + if (shader_data->xfb_info) { + *xfb_info = *shader_data->xfb_info; + typed_memcpy(xfb_info->outputs, shader_data->xfb_info->outputs, + shader_data->xfb_info->output_count); + shader->xfb_info = xfb_info; + } + + typed_memcpy(&shader->push_desc_info, &shader_data->push_desc_info, 1); + + typed_memcpy(&shader->bind_map, &shader_data->bind_map, 1); + typed_memcpy(surface_to_descriptor, + shader_data->bind_map.surface_to_descriptor, + shader_data->bind_map.surface_count); + typed_memcpy(sampler_to_descriptor, + shader_data->bind_map.sampler_to_descriptor, + shader_data->bind_map.sampler_count); + typed_memcpy(embedded_sampler_to_binding, + shader_data->bind_map.embedded_sampler_to_binding, + shader_data->bind_map.embedded_sampler_count); + typed_memcpy(shader->bind_map.input_attachments, + shader_data->bind_map.input_attachments, + ARRAY_SIZE(shader_data->bind_map.input_attachments)); + shader->bind_map.surface_to_descriptor = surface_to_descriptor; + shader->bind_map.sampler_to_descriptor = sampler_to_descriptor; + shader->bind_map.embedded_sampler_to_binding = embedded_sampler_to_binding; + + shader->instance_multiplier = shader_data->instance_multiplier; + + result = anv_shader_reloc(device, shader_data->code, shader, pAllocator); + if (result != VK_SUCCESS) + goto error_embedded_samplers; + + memcpy(shader->kernel.map, shader_data->code, + shader_data->prog_data.base.program_size); + + if (mesa_shader_stage_is_rt(shader->vk.stage)) { + const struct brw_bs_prog_data *bs_prog_data = + (const struct brw_bs_prog_data *)shader->prog_data; + shader->vk.stack_size = bs_prog_data->max_stack_size; + } + shader->vk.scratch_size = shader->prog_data->total_scratch; + shader->vk.ray_queries = shader->prog_data->ray_queries; + + *shader_out = &shader->vk; + + return VK_SUCCESS; + + error_embedded_samplers: + for (uint32_t s = 0; s < shader->bind_map.embedded_sampler_count; s++) 
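+ /* unwind the references taken by anv_device_get_embedded_samplers() */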
+ anv_embedded_sampler_unref(device, shader->embedded_samplers[s]); + anv_state_pool_free(&device->instruction_state_pool, shader->kernel); + error_shader: + vk_shader_free(&device->vk, pAllocator, &shader->vk); + return result; +} diff --git a/src/intel/vulkan/anv_shader.h b/src/intel/vulkan/anv_shader.h new file mode 100644 index 00000000000..96b6ab0fb40 --- /dev/null +++ b/src/intel/vulkan/anv_shader.h @@ -0,0 +1,59 @@ +/* Copyright © 2024 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "anv_private.h" +#include "anv_nir.h" + +#include "nir/nir_xfb_info.h" + +struct anv_shader_data { + struct vk_shader_compile_info *info; + + struct vk_shader **shader_out; + + union brw_any_prog_key key; + uint32_t key_size; + + union brw_any_prog_data prog_data; + + uint32_t source_hash; + + const nir_xfb_info *xfb_info; + + uint32_t num_stats; + struct brw_compile_stats stats[3]; + char *disasm[3]; + + bool use_primitive_replication; + uint32_t instance_multiplier; + + /* For fragment shaders only */ + struct brw_mue_map *mue_map; + + struct anv_push_descriptor_info push_desc_info; + + struct anv_pipeline_bind_map bind_map; + + struct anv_pipeline_push_map push_map; + + bool uses_bt_for_push_descs; + + unsigned *code; +}; + +VkResult anv_shader_create(struct anv_device *device, + mesa_shader_stage stage, + struct anv_shader_data *shader_data, + const VkAllocationCallbacks *pAllocator, + struct vk_shader **shader_out); + +VkResult anv_shader_deserialize(struct vk_device *device, + struct blob_reader *blob, + uint32_t binary_version, + const VkAllocationCallbacks* pAllocator, + struct vk_shader **shader_out); + +extern struct vk_device_shader_ops anv_device_shader_ops; diff --git a/src/intel/vulkan/anv_shader_compile.c b/src/intel/vulkan/anv_shader_compile.c new file mode 100644 index 00000000000..9f4f62f0869 --- /dev/null +++ b/src/intel/vulkan/anv_shader_compile.c @@ -0,0 +1,1864 @@ +/* Copyright © 2024 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "anv_private.h" +#include "anv_nir.h" +#include "anv_shader.h" + +#include "vk_nir_convert_ycbcr.h" +#include "vk_pipeline.h" + +#include "common/intel_compute_slm.h" +#include "common/intel_l3_config.h" + +#include "compiler/brw_nir.h" +#include "compiler/brw_nir_rt.h" +#include "compiler/intel_nir.h" + +static enum anv_descriptor_set_layout_type +set_layouts_get_layout_type(struct anv_descriptor_set_layout * const *set_layouts, + uint32_t set_layout_count) +{ + for (uint32_t s = 0; s < set_layout_count; s++) { + if (set_layouts[s]) { + return set_layouts[s]->type; + } + } + + return ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_UNKNOWN; +} + +void +anv_shader_init_uuid(struct anv_physical_device *device) +{ + /* We should include any parameter here that will change the compiler's + * output. Mostly it's workarounds, but there are also settings for using + * indirect descriptors (a different binding model). + * + * The fp64 workaround is skipped because although it changes the + * compiler's output, not having that workaround enabled with an app + * expecting fp64 support will just crash in the backend.
*/ + struct mesa_sha1 ctx; + _mesa_sha1_init(&ctx); + + const bool indirect_descriptors = device->indirect_descriptors; + _mesa_sha1_update(&ctx, &indirect_descriptors, sizeof(indirect_descriptors)); + + const int spilling_rate = device->compiler->spilling_rate; + _mesa_sha1_update(&ctx, &spilling_rate, sizeof(spilling_rate)); + + const uint8_t afs = device->instance->assume_full_subgroups; + _mesa_sha1_update(&ctx, &afs, sizeof(afs)); + + const bool afswb = device->instance->assume_full_subgroups_with_barrier; + _mesa_sha1_update(&ctx, &afswb, sizeof(afswb)); + + const bool afs_shm = device->instance->assume_full_subgroups_with_shared_memory; + _mesa_sha1_update(&ctx, &afs_shm, sizeof(afs_shm)); + + const bool erwf = device->instance->emulate_read_without_format; + _mesa_sha1_update(&ctx, &erwf, sizeof(erwf)); + + const bool lttd = device->instance->lower_terminate_to_discard; + _mesa_sha1_update(&ctx, &lttd, sizeof(lttd)); + + const bool large_wg_wa = + device->instance->large_workgroup_non_coherent_image_workaround; + _mesa_sha1_update(&ctx, &large_wg_wa, sizeof(large_wg_wa)); + + uint8_t sha1[20]; + _mesa_sha1_final(&ctx, sha1); + memcpy(device->shader_binary_uuid, sha1, sizeof(device->shader_binary_uuid)); +} + +static const struct nir_shader_compiler_options * +anv_shader_get_nir_options(struct vk_physical_device *device, + mesa_shader_stage stage, + const struct vk_pipeline_robustness_state *rs) +{ + struct anv_physical_device *pdevice = + container_of(device, struct anv_physical_device, vk); + const struct brw_compiler *compiler = pdevice->compiler; + + return compiler->nir_options[stage]; +} + +static struct spirv_to_nir_options +anv_shader_get_spirv_options(struct vk_physical_device *device, + mesa_shader_stage stage, + const struct vk_pipeline_robustness_state *rs) +{ + struct anv_physical_device *pdevice = + container_of(device, struct anv_physical_device, vk); + enum brw_robustness_flags robust_flags = anv_get_robust_flags(rs); + + return (struct spirv_to_nir_options) { + .ubo_addr_format = anv_nir_ubo_addr_format(pdevice, robust_flags), + .ssbo_addr_format = anv_nir_ssbo_addr_format(pdevice, robust_flags), + .phys_ssbo_addr_format = nir_address_format_64bit_global, + .push_const_addr_format = nir_address_format_logical, + + /* TODO: Consider changing this to an address format that has the NULL + * pointer equal to 0. That might be a better format to play nice + * with certain code / code generators.
*/ + .shared_addr_format = nir_address_format_32bit_offset, + + .min_ubo_alignment = ANV_UBO_ALIGNMENT, + .min_ssbo_alignment = ANV_SSBO_ALIGNMENT, + + .workarounds = { + .lower_terminate_to_discard = pdevice->instance->lower_terminate_to_discard, + }, + }; +} + +static void +anv_shader_preprocess_nir(struct vk_physical_device *device, + nir_shader *nir, + const struct vk_pipeline_robustness_state *rs) +{ + struct anv_physical_device *pdevice = + container_of(device, struct anv_physical_device, vk); + const struct brw_compiler *compiler = pdevice->compiler; + + NIR_PASS(_, nir, nir_lower_io_vars_to_temporaries, + nir_shader_get_entrypoint(nir), true, false); + + const struct nir_lower_sysvals_to_varyings_options sysvals_to_varyings = { + .point_coord = true, + }; + NIR_PASS(_, nir, nir_lower_sysvals_to_varyings, &sysvals_to_varyings); + + const nir_opt_access_options opt_access_options = { + .is_vulkan = true, + }; + NIR_PASS(_, nir, nir_opt_access, &opt_access_options); + + struct brw_nir_compiler_opts opts = { + .robust_image_access = rs->images == VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_ROBUST_IMAGE_ACCESS || + rs->images == VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_ROBUST_IMAGE_ACCESS_2_EXT, + }; + brw_preprocess_nir(compiler, nir, &opts); + + NIR_PASS(_, nir, nir_opt_barrier_modes); + + nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); +} + +static void +populate_base_prog_key(struct brw_base_prog_key *key, + const struct vk_physical_device *device, + const struct vk_pipeline_robustness_state *rs) +{ + const struct anv_physical_device *pdevice = + container_of(device, const struct anv_physical_device, vk); + + /* We can avoid including this for hashing because the runtime already + * hashes that information. We just put it here for use at compile time. + */ + if (rs != NULL) + key->robust_flags = anv_get_robust_flags(rs); + key->limit_trig_input_range = pdevice->instance->limit_trig_input_range; +} + +static void +populate_base_gfx_prog_key(struct brw_base_prog_key *key, + const struct vk_physical_device *device, + const struct vk_pipeline_robustness_state *rs, + const struct vk_graphics_pipeline_state *gfx_state, + VkShaderStageFlags link_stages) +{ + const struct anv_physical_device *pdevice = + container_of(device, const struct anv_physical_device, vk); + + populate_base_prog_key(key, device, rs); + + key->view_mask = (gfx_state && gfx_state->rp) ? gfx_state->rp->view_mask : 0; + + key->vue_layout = + (util_bitcount(link_stages) > 1 && (link_stages & VK_SHADER_STAGE_FRAGMENT_BIT)) ? + INTEL_VUE_LAYOUT_FIXED : + pdevice->info.verx10 >= 125 ?
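+ /* not linked with a fragment shader: use a separate layout, mesh-aware on Gfx12.5+ */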
INTEL_VUE_LAYOUT_SEPARATE_MESH : + INTEL_VUE_LAYOUT_SEPARATE; +} + +static void +populate_vs_prog_key(struct brw_vs_prog_key *key, + const struct vk_physical_device *device, + const struct vk_pipeline_robustness_state *rs, + const struct vk_graphics_pipeline_state *state, + VkShaderStageFlags link_stages) +{ + const struct anv_physical_device *pdevice = + container_of(device, const struct anv_physical_device, vk); + + populate_base_gfx_prog_key(&key->base, device, rs, state, link_stages); + + key->vf_component_packing = pdevice->instance->vf_component_packing; +} + +static void +populate_tcs_prog_key(struct brw_tcs_prog_key *key, + const struct vk_physical_device *device, + const struct vk_pipeline_robustness_state *rs, + const struct vk_graphics_pipeline_state *state, + VkShaderStageFlags link_stages) +{ + populate_base_gfx_prog_key(&key->base, device, rs, state, link_stages); + + if (state && state->ts && + !BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS)) + key->input_vertices = state->ts->patch_control_points; + + key->separate_tess_vue_layout = + !(link_stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT); +} + +static void +populate_tes_prog_key(struct brw_tes_prog_key *key, + const struct vk_physical_device *device, + const struct vk_pipeline_robustness_state *rs, + const struct vk_graphics_pipeline_state *state, + VkShaderStageFlags link_stages) +{ + populate_base_gfx_prog_key(&key->base, device, rs, state, link_stages); + + key->separate_tess_vue_layout = + !(link_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT); +} + +static void +populate_gs_prog_key(struct brw_gs_prog_key *key, + const struct vk_physical_device *device, + const struct vk_pipeline_robustness_state *rs, + const struct vk_graphics_pipeline_state *state, + VkShaderStageFlags link_stages) +{ + populate_base_gfx_prog_key(&key->base, device, rs, state, link_stages); +} + +static void +populate_task_prog_key(struct brw_task_prog_key *key, + const struct vk_physical_device *device, + const struct vk_pipeline_robustness_state *rs, + const struct vk_graphics_pipeline_state *state, + VkShaderStageFlags link_stages) +{ + populate_base_gfx_prog_key(&key->base, device, rs, state, link_stages); + + key->base.uses_inline_push_addr = true; +} + +static void +populate_mesh_prog_key(struct brw_mesh_prog_key *key, + const struct vk_physical_device *device, + const struct vk_pipeline_robustness_state *rs, + const struct vk_graphics_pipeline_state *state, + VkShaderStageFlags link_stages) +{ + populate_base_gfx_prog_key(&key->base, device, rs, state, link_stages); + + key->base.uses_inline_push_addr = true; +} + +static bool +pipeline_has_coarse_pixel(const struct vk_graphics_pipeline_state *state) +{ + if (state == NULL) + return true; + + /* The Vulkan 1.2.199 spec says: + * + * "If any of the following conditions are met, Cxy' must be set to + * {1,1}: + * + * * If Sample Shading is enabled. + * * [...]" + * + * And "sample shading" is defined as follows: + * + * "Sample shading is enabled for a graphics pipeline: + * + * * If the interface of the fragment shader entry point of the + * graphics pipeline includes an input variable decorated with + * SampleId or SamplePosition. In this case minSampleShadingFactor + * takes the value 1.0. + * + * * Else if the sampleShadingEnable member of the + * VkPipelineMultisampleStateCreateInfo structure specified when + * creating the graphics pipeline is set to VK_TRUE. 
In this case + * minSampleShadingFactor takes the value of + * VkPipelineMultisampleStateCreateInfo::minSampleShading. + * + * Otherwise, sample shading is considered disabled." + * + * The first bullet above is handled by the back-end compiler because those + * inputs both force per-sample dispatch. The second bullet is handled + * here. Note that sample shading being enabled here has nothing to do + * with minSampleShading. + */ + if (state->ms != NULL && state->ms->sample_shading_enable) + return false; + + /* Not dynamic & the pipeline has a 1x1 fragment shading rate with no + * possibility for any element of the pipeline to change the value, or the + * fragment shading rate is not specified at all. + */ + if (!BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_FSR) && + (state->fsr == NULL || + (state->fsr->fragment_size.width <= 1 && + state->fsr->fragment_size.height <= 1 && + state->fsr->combiner_ops[0] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR && + state->fsr->combiner_ops[1] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR))) + return false; + + return true; +} + +static uint32_t +rp_color_mask(const struct vk_graphics_pipeline_state *state) +{ + if (state == NULL || state->rp == NULL || + !vk_render_pass_state_has_attachment_info(state->rp)) + return ((1u << MAX_RTS) - 1); + + assert(state->rp->color_attachment_count <= MAX_RTS); + + uint32_t color_mask = 0; + for (uint32_t i = 0; i < state->rp->color_attachment_count; i++) { + if (state->rp->color_attachment_formats[i] != VK_FORMAT_UNDEFINED) + color_mask |= BITFIELD_BIT(i); + } + + return color_mask; +} + +static void +populate_wm_prog_key(struct brw_wm_prog_key *key, + const struct vk_physical_device *device, + const struct vk_pipeline_robustness_state *rs, + const struct vk_graphics_pipeline_state *state, + VkShaderStageFlags link_stages) +{ + const struct anv_physical_device *pdevice = + container_of(device, const struct anv_physical_device, vk); + + populate_base_gfx_prog_key(&key->base, device, rs, state, link_stages); + + /* Consider all inputs as valid until we look at the NIR variables. */ + key->color_outputs_valid = rp_color_mask(state); + key->nr_color_regions = util_last_bit(key->color_outputs_valid); + + /* To reduce possible shader recompilations we would need to know if + * there is a SampleMask output variable, to compute whether we should + * emit code to work around the issue that the hardware disables alpha to + * coverage when there is a SampleMask output. + * + * If the pipeline we compile the fragment shader in includes the output + * interface, then we can be sure whether alpha_coverage is enabled or not. + * If we don't have that output interface, then we have to compile the + * shader with some conditionals. + */ + if (state != NULL && state->ms != NULL) { + /* VUID-VkGraphicsPipelineCreateInfo-rasterizerDiscardEnable-00751: + * + * "If the pipeline is being created with fragment shader state, + * pMultisampleState must be a valid pointer to a valid + * VkPipelineMultisampleStateCreateInfo structure" + * + * It's also required for the fragment output interface. + */ + key->multisample_fbo = + BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES) ? + INTEL_SOMETIMES : + state->ms->rasterization_samples > 1 ? INTEL_ALWAYS : INTEL_NEVER; + key->persample_interp = + BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES) ? + INTEL_SOMETIMES : + (state->ms->sample_shading_enable && + (state->ms->min_sample_shading * state->ms->rasterization_samples) > 1) ?
+ INTEL_ALWAYS : INTEL_NEVER; + key->alpha_to_coverage = + BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE) ? + INTEL_SOMETIMES : + (state->ms->alpha_to_coverage_enable ? INTEL_ALWAYS : INTEL_NEVER); + + /* TODO: We should make this dynamic */ + if (pdevice->instance->sample_mask_out_opengl_behaviour) + key->ignore_sample_mask_out = !key->multisample_fbo; + } else { + /* Consider all inputs as valid until we look at the NIR variables. */ + key->color_outputs_valid = BITFIELD_MASK(MAX_RTS); + key->nr_color_regions = MAX_RTS; + + key->alpha_to_coverage = INTEL_SOMETIMES; + key->multisample_fbo = INTEL_SOMETIMES; + key->persample_interp = INTEL_SOMETIMES; + } + + if (pdevice->info.verx10 >= 200) { + if (state != NULL && state->rs != NULL) { + key->provoking_vertex_last = + BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX) ? + INTEL_SOMETIMES : + state->rs->provoking_vertex == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT ? + INTEL_ALWAYS : INTEL_NEVER; + } else { + key->provoking_vertex_last = INTEL_SOMETIMES; + } + } else { + /* Pre-Xe2 we don't care about this at all, make sure it's always set to + * NEVER to avoid it influencing the push constant. + */ + key->provoking_vertex_last = INTEL_NEVER; + } + + key->mesh_input = + (link_stages & VK_SHADER_STAGE_VERTEX_BIT) ? INTEL_NEVER : + (link_stages & VK_SHADER_STAGE_MESH_BIT_EXT) ? INTEL_ALWAYS : + pdevice->info.verx10 >= 125 ? INTEL_SOMETIMES : INTEL_NEVER; + + if (state && state->ms) { + key->min_sample_shading = state->ms->min_sample_shading; + key->api_sample_shading = state->ms->sample_shading_enable; + } + + key->coarse_pixel = pipeline_has_coarse_pixel(state); + + key->null_push_constant_tbimr_workaround = + pdevice->info.needs_null_push_constant_tbimr_workaround; +} + +static void +populate_cs_prog_key(struct brw_cs_prog_key *key, + const struct vk_physical_device *device, + const struct vk_pipeline_robustness_state *rs, + bool lower_unaligned_dispatch) +{ + const struct anv_physical_device *pdevice = + container_of(device, const struct anv_physical_device, vk); + + populate_base_prog_key(&key->base, device, rs); + + key->base.uses_inline_push_addr = pdevice->info.verx10 >= 125; + key->lower_unaligned_dispatch = lower_unaligned_dispatch; +} + +static void +populate_bs_prog_key(struct brw_bs_prog_key *key, + const struct vk_physical_device *device, + const struct vk_pipeline_robustness_state *rs, + VkPipelineCreateFlags2KHR flags) +{ + populate_base_prog_key(&key->base, device, rs); + + uint32_t ray_flags = 0; + const bool rt_skip_triangles = + flags & VK_PIPELINE_CREATE_2_RAY_TRACING_SKIP_TRIANGLES_BIT_KHR; + const bool rt_skip_aabbs = + flags & VK_PIPELINE_CREATE_2_RAY_TRACING_SKIP_AABBS_BIT_KHR; + assert(!(rt_skip_triangles && rt_skip_aabbs)); + if (rt_skip_triangles) + ray_flags |= BRW_RT_RAY_FLAG_SKIP_TRIANGLES; + else if (rt_skip_aabbs) + ray_flags |= BRW_RT_RAY_FLAG_SKIP_AABBS; + + key->pipeline_ray_flags = ray_flags; +} + +static void +anv_shader_hash_state(struct vk_physical_device *device, + const struct vk_graphics_pipeline_state *state, + const struct vk_features *enabled_features, + VkShaderStageFlags stages, + blake3_hash blake3_out) +{ + struct mesa_blake3 blake3_ctx; + _mesa_blake3_init(&blake3_ctx); + + anv_foreach_vk_stage(stage, stages) { + union brw_any_prog_key key; + memset(&key, 0, sizeof(key)); + + switch (stage) { + case VK_SHADER_STAGE_VERTEX_BIT: + populate_vs_prog_key(&key.vs, device, NULL, state, stages); + _mesa_blake3_update(&blake3_ctx, &key.vs, sizeof(key.vs)); + break; 
+ case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT: + populate_tcs_prog_key(&key.tcs, device, NULL, state, stages); + _mesa_blake3_update(&blake3_ctx, &key.tcs, sizeof(key.tcs)); + break; + case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT: + populate_tes_prog_key(&key.tes, device, NULL, state, stages); + _mesa_blake3_update(&blake3_ctx, &key.tes, sizeof(key.tes)); + break; + case VK_SHADER_STAGE_GEOMETRY_BIT: + populate_gs_prog_key(&key.gs, device, NULL, state, stages); + _mesa_blake3_update(&blake3_ctx, &key.gs, sizeof(key.gs)); + break; + case VK_SHADER_STAGE_TASK_BIT_EXT: + populate_task_prog_key(&key.task, device, NULL, state, stages); + _mesa_blake3_update(&blake3_ctx, &key.task, sizeof(key.task)); + break; + case VK_SHADER_STAGE_MESH_BIT_EXT: + populate_mesh_prog_key(&key.mesh, device, NULL, state, stages); + _mesa_blake3_update(&blake3_ctx, &key.mesh, sizeof(key.mesh)); + break; + case VK_SHADER_STAGE_FRAGMENT_BIT: + populate_wm_prog_key(&key.wm, device, NULL, state, stages); + _mesa_blake3_update(&blake3_ctx, &key.wm, sizeof(key.wm)); + break; + case VK_SHADER_STAGE_COMPUTE_BIT: + populate_cs_prog_key(&key.cs, device, NULL, false); + _mesa_blake3_update(&blake3_ctx, &key.cs, sizeof(key.cs)); + break; + default: + UNREACHABLE("Invalid stage"); + } + } + + _mesa_blake3_final(&blake3_ctx, blake3_out); +} + +static void +merge_tess_info(struct shader_info *tes_info, + const struct shader_info *tcs_info) +{ + /* The Vulkan 1.0.38 spec, section 21.1 Tessellator says: + * + * "PointMode. Controls generation of points rather than triangles + * or lines. This functionality defaults to disabled, and is + * enabled if either shader stage includes the execution mode. + * + * and about Triangles, Quads, IsoLines, VertexOrderCw, VertexOrderCcw, + * PointMode, SpacingEqual, SpacingFractionalEven, SpacingFractionalOdd, + * and OutputVertices, it says: + * + * "One mode must be set in at least one of the tessellation + * shader stages." + * + * So, the fields can be set in either the TCS or TES, but they must + * agree if set in both. Our backend looks at TES, so bitwise-or in + * the values from the TCS. 
*/ + assert(tcs_info->tess.tcs_vertices_out == 0 || + tes_info->tess.tcs_vertices_out == 0 || + tcs_info->tess.tcs_vertices_out == tes_info->tess.tcs_vertices_out); + tes_info->tess.tcs_vertices_out |= tcs_info->tess.tcs_vertices_out; + + assert(tcs_info->tess.spacing == TESS_SPACING_UNSPECIFIED || + tes_info->tess.spacing == TESS_SPACING_UNSPECIFIED || + tcs_info->tess.spacing == tes_info->tess.spacing); + tes_info->tess.spacing |= tcs_info->tess.spacing; + + assert(tcs_info->tess._primitive_mode == 0 || + tes_info->tess._primitive_mode == 0 || + tcs_info->tess._primitive_mode == tes_info->tess._primitive_mode); + tes_info->tess._primitive_mode |= tcs_info->tess._primitive_mode; + tes_info->tess.ccw |= tcs_info->tess.ccw; + tes_info->tess.point_mode |= tcs_info->tess.point_mode; +} + +static void +anv_shader_link_tcs(const struct brw_compiler *compiler, + struct brw_tcs_prog_key *key, + struct vk_shader_compile_info *tcs_stage, + struct vk_shader_compile_info *tes_stage) +{ + assert(tes_stage && tes_stage->stage == MESA_SHADER_TESS_EVAL); + + brw_nir_link_shaders(compiler, tcs_stage->nir, tes_stage->nir); + + nir_lower_patch_vertices(tes_stage->nir, + tcs_stage->nir->info.tess.tcs_vertices_out, + NULL); + + /* Copy TCS info into the TES info */ + merge_tess_info(&tes_stage->nir->info, &tcs_stage->nir->info); + + /* Whacking the key after cache lookup is a bit sketchy, but all of + * this comes from the SPIR-V, which is part of the hash used for the + * pipeline cache. So it should be safe. + */ + key->_tes_primitive_mode = tes_stage->nir->info.tess._primitive_mode; +} + +static void +anv_shader_link(const struct brw_compiler *compiler, + struct vk_shader_compile_info *prev_stage, + struct vk_shader_compile_info *next_stage) +{ + brw_nir_link_shaders(compiler, prev_stage->nir, next_stage->nir); +} + +static const struct vk_ycbcr_conversion_state * +lookup_ycbcr_conversion(const void *_stage, uint32_t set, + uint32_t binding, uint32_t array_index) +{ + const struct vk_shader_compile_info *stage = _stage; + + assert(set < MAX_SETS); + const struct anv_descriptor_set_layout *set_layout = + container_of(stage->set_layouts[set], + struct anv_descriptor_set_layout, vk); + + assert(binding < set_layout->binding_count); + const struct anv_descriptor_set_binding_layout *bind_layout = + &set_layout->binding[binding]; + + if (bind_layout->samplers == NULL) + return NULL; + + array_index = MIN2(array_index, bind_layout->array_size - 1); + + const struct anv_descriptor_set_layout_sampler *sampler = + &bind_layout->samplers[array_index]; + + return sampler->has_ycbcr_conversion ? + &sampler->ycbcr_conversion_state : NULL; +} + +static void +anv_fixup_subgroup_size(struct anv_instance *instance, struct shader_info *info) +{ + if (!mesa_shader_stage_uses_workgroup(info->stage)) + return; + + unsigned local_size = info->workgroup_size[0] * + info->workgroup_size[1] * + info->workgroup_size[2]; + + /* Games don't always request full subgroups when they should, + * which can cause bugs, as they may expect a bigger subgroup size + * than the one we choose for the execution. + */ + if (instance->assume_full_subgroups && + info->uses_wide_subgroup_intrinsics && + info->subgroup_size == SUBGROUP_SIZE_API_CONSTANT && + local_size && + local_size % BRW_SUBGROUP_SIZE == 0) + info->subgroup_size = SUBGROUP_SIZE_FULL_SUBGROUPS; + + /* If the client requests that we dispatch full subgroups but doesn't + * allow us to pick a subgroup size, we have to smash it to the API + * value of 32.
Performance will likely be terrible in this case but + * there's nothing we can do about that. The client should have chosen + * a size. + */ + if (info->subgroup_size == SUBGROUP_SIZE_FULL_SUBGROUPS) + info->subgroup_size = + instance->assume_full_subgroups != 0 ? + instance->assume_full_subgroups : BRW_SUBGROUP_SIZE; + + /* Cooperative matrix extension requires that all invocations in a subgroup + * be active. As a result, when the application does not request a specific + * subgroup size, we must use SIMD32. + */ + if (info->stage == MESA_SHADER_COMPUTE && info->cs.has_cooperative_matrix && + info->subgroup_size < SUBGROUP_SIZE_REQUIRE_8) { + info->subgroup_size = BRW_SUBGROUP_SIZE; + } +} + +static void +anv_shader_compile_vs(struct anv_device *device, + void *mem_ctx, + struct anv_shader_data *shader_data, + char **error_str) +{ + const struct brw_compiler *compiler = device->physical->compiler; + nir_shader *nir = shader_data->info->nir; + + shader_data->num_stats = 1; + + struct brw_compile_vs_params params = { + .base = { + .nir = nir, + .stats = shader_data->stats, + .log_data = device, + .mem_ctx = mem_ctx, + .source_hash = shader_data->source_hash, + }, + .key = &shader_data->key.vs, + .prog_data = &shader_data->prog_data.vs, + }; + + shader_data->code = (void *)brw_compile_vs(compiler, ¶ms); + *error_str = params.base.error_str; +} + +static void +anv_shader_compile_tcs(struct anv_device *device, + void *mem_ctx, + struct anv_shader_data *shader_data, + char **error_str) +{ + const struct brw_compiler *compiler = device->physical->compiler; + nir_shader *nir = shader_data->info->nir; + + shader_data->key.tcs.outputs_written = nir->info.outputs_written; + shader_data->key.tcs.patch_outputs_written = nir->info.patch_outputs_written; + + shader_data->num_stats = 1; + + struct brw_compile_tcs_params params = { + .base = { + .nir = nir, + .stats = shader_data->stats, + .log_data = device, + .mem_ctx = mem_ctx, + .source_hash = shader_data->source_hash, + }, + .key = &shader_data->key.tcs, + .prog_data = &shader_data->prog_data.tcs, + }; + + shader_data->code = (void *)brw_compile_tcs(compiler, ¶ms); + *error_str = params.base.error_str; +} + +static void +anv_shader_compile_tes(struct anv_device *device, + void *mem_ctx, + struct anv_shader_data *tes_shader_data, + struct anv_shader_data *tcs_shader_data, + char **error_str) +{ + const struct brw_compiler *compiler = device->physical->compiler; + nir_shader *nir = tes_shader_data->info->nir; + + if (tcs_shader_data) { + tes_shader_data->key.tes.inputs_read = + tcs_shader_data->info->nir->info.outputs_written; + tes_shader_data->key.tes.patch_inputs_read = + tcs_shader_data->info->nir->info.patch_outputs_written; + } + + tes_shader_data->num_stats = 1; + + struct brw_compile_tes_params params = { + .base = { + .nir = nir, + .stats = tes_shader_data->stats, + .log_data = device, + .mem_ctx = mem_ctx, + .source_hash = tes_shader_data->source_hash, + }, + .key = &tes_shader_data->key.tes, + .prog_data = &tes_shader_data->prog_data.tes, + .input_vue_map = tcs_shader_data ? 
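+ /* with a linked TCS, lay out the TES inputs using the TCS output VUE map */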
+
+static void
+anv_shader_compile_vs(struct anv_device *device,
+                      void *mem_ctx,
+                      struct anv_shader_data *shader_data,
+                      char **error_str)
+{
+   const struct brw_compiler *compiler = device->physical->compiler;
+   nir_shader *nir = shader_data->info->nir;
+
+   shader_data->num_stats = 1;
+
+   struct brw_compile_vs_params params = {
+      .base = {
+         .nir = nir,
+         .stats = shader_data->stats,
+         .log_data = device,
+         .mem_ctx = mem_ctx,
+         .source_hash = shader_data->source_hash,
+      },
+      .key = &shader_data->key.vs,
+      .prog_data = &shader_data->prog_data.vs,
+   };
+
+   shader_data->code = (void *)brw_compile_vs(compiler, &params);
+   *error_str = params.base.error_str;
+}
+
+static void
+anv_shader_compile_tcs(struct anv_device *device,
+                       void *mem_ctx,
+                       struct anv_shader_data *shader_data,
+                       char **error_str)
+{
+   const struct brw_compiler *compiler = device->physical->compiler;
+   nir_shader *nir = shader_data->info->nir;
+
+   shader_data->key.tcs.outputs_written = nir->info.outputs_written;
+   shader_data->key.tcs.patch_outputs_written = nir->info.patch_outputs_written;
+
+   shader_data->num_stats = 1;
+
+   struct brw_compile_tcs_params params = {
+      .base = {
+         .nir = nir,
+         .stats = shader_data->stats,
+         .log_data = device,
+         .mem_ctx = mem_ctx,
+         .source_hash = shader_data->source_hash,
+      },
+      .key = &shader_data->key.tcs,
+      .prog_data = &shader_data->prog_data.tcs,
+   };
+
+   shader_data->code = (void *)brw_compile_tcs(compiler, &params);
+   *error_str = params.base.error_str;
+}
+
+static void
+anv_shader_compile_tes(struct anv_device *device,
+                       void *mem_ctx,
+                       struct anv_shader_data *tes_shader_data,
+                       struct anv_shader_data *tcs_shader_data,
+                       char **error_str)
+{
+   const struct brw_compiler *compiler = device->physical->compiler;
+   nir_shader *nir = tes_shader_data->info->nir;
+
+   if (tcs_shader_data) {
+      tes_shader_data->key.tes.inputs_read =
+         tcs_shader_data->info->nir->info.outputs_written;
+      tes_shader_data->key.tes.patch_inputs_read =
+         tcs_shader_data->info->nir->info.patch_outputs_written;
+   }
+
+   tes_shader_data->num_stats = 1;
+
+   struct brw_compile_tes_params params = {
+      .base = {
+         .nir = nir,
+         .stats = tes_shader_data->stats,
+         .log_data = device,
+         .mem_ctx = mem_ctx,
+         .source_hash = tes_shader_data->source_hash,
+      },
+      .key = &tes_shader_data->key.tes,
+      .prog_data = &tes_shader_data->prog_data.tes,
+      .input_vue_map = tcs_shader_data ?
+                       &tcs_shader_data->prog_data.tcs.base.vue_map : NULL,
+   };
+
+   tes_shader_data->code = (void *)brw_compile_tes(compiler, &params);
+   *error_str = params.base.error_str;
+}
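
Note how the TES key above is seeded from the TCS output masks: the evaluation shader can only read what the control shader wrote, and because the masks live in the compile key, any change to the TCS interface also changes the TES cache key. A toy illustration with made-up slot bits:

   #include <stdint.h>
   #include <stdio.h>

   int main(void)
   {
      /* Pretend the TCS wrote slots 0, 1 and 4 (bits of outputs_written). */
      uint64_t tcs_outputs_written = (1ull << 0) | (1ull << 1) | (1ull << 4);

      /* Seed the TES key: it may read at most what the TCS wrote. */
      uint64_t tes_inputs_read = tcs_outputs_written;

      printf("tes inputs_read = 0x%llx\n",
             (unsigned long long)tes_inputs_read); /* 0x13 */
      return 0;
   }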
+
+static void
+anv_shader_compile_gs(struct anv_device *device,
+                      void *mem_ctx,
+                      struct anv_shader_data *shader_data,
+                      char **error_str)
+{
+   const struct brw_compiler *compiler = device->physical->compiler;
+   nir_shader *nir = shader_data->info->nir;
+
+   shader_data->num_stats = 1;
+
+   struct brw_compile_gs_params params = {
+      .base = {
+         .nir = nir,
+         .stats = shader_data->stats,
+         .log_data = device,
+         .mem_ctx = mem_ctx,
+         .source_hash = shader_data->source_hash,
+      },
+      .key = &shader_data->key.gs,
+      .prog_data = &shader_data->prog_data.gs,
+   };
+
+   shader_data->code = (void *)brw_compile_gs(compiler, &params);
+   *error_str = params.base.error_str;
+}
+
+static void
+anv_shader_compile_task(struct anv_device *device,
+                        void *mem_ctx,
+                        struct anv_shader_data *shader_data,
+                        char **error_str)
+{
+   const struct brw_compiler *compiler = device->physical->compiler;
+   nir_shader *nir = shader_data->info->nir;
+
+   shader_data->num_stats = 1;
+
+   struct brw_compile_task_params params = {
+      .base = {
+         .nir = nir,
+         .stats = shader_data->stats,
+         .log_data = device,
+         .mem_ctx = mem_ctx,
+         .source_hash = shader_data->source_hash,
+      },
+      .key = &shader_data->key.task,
+      .prog_data = &shader_data->prog_data.task,
+   };
+
+   shader_data->code = (void *)brw_compile_task(compiler, &params);
+   *error_str = params.base.error_str;
+}
+
+static nir_def *
+mesh_load_provoking_vertex(nir_builder *b, void *data)
+{
+   return nir_load_inline_data_intel(
+      b, 1, 32,
+      .base = ANV_INLINE_PARAM_MESH_PROVOKING_VERTEX);
+}
+
+static void
+anv_shader_compile_mesh(struct anv_device *device,
+                        void *mem_ctx,
+                        struct anv_shader_data *mesh_shader_data,
+                        struct anv_shader_data *task_shader_data,
+                        char **error_str)
+{
+   const struct brw_compiler *compiler = device->physical->compiler;
+   nir_shader *nir = mesh_shader_data->info->nir;
+
+   mesh_shader_data->num_stats = 1;
+
+   struct brw_compile_mesh_params params = {
+      .base = {
+         .nir = nir,
+         .stats = mesh_shader_data->stats,
+         .log_data = device,
+         .mem_ctx = mem_ctx,
+         .source_hash = mesh_shader_data->source_hash,
+      },
+      .key = &mesh_shader_data->key.mesh,
+      .prog_data = &mesh_shader_data->prog_data.mesh,
+      .tue_map = task_shader_data ?
+                 &task_shader_data->prog_data.task.map :
+                 NULL,
+      .load_provoking_vertex = mesh_load_provoking_vertex,
+   };
+
+   mesh_shader_data->code = (void *)brw_compile_mesh(compiler, &params);
+   *error_str = params.base.error_str;
+}
+
+static void
+anv_shader_compile_fs(struct anv_device *device,
+                      void *mem_ctx,
+                      struct anv_shader_data *shader_data,
+                      const struct vk_graphics_pipeline_state *state,
+                      char **error_str)
+{
+   const struct brw_compiler *compiler = device->physical->compiler;
+   nir_shader *nir = shader_data->info->nir;
+
+   /* When using Primitive Replication for multiview, each view gets its own
+    * position slot.
+    */
+   uint32_t pos_slots = shader_data->use_primitive_replication ?
+      MAX2(1, util_bitcount(shader_data->key.base.view_mask)) : 1;
+
+   struct intel_vue_map prev_vue_map;
+   brw_compute_vue_map(compiler->devinfo,
+                       &prev_vue_map,
+                       nir->info.inputs_read,
+                       nir->info.separate_shader,
+                       pos_slots);
+
+   shader_data->key.wm.input_slots_valid = prev_vue_map.slots_valid;
+
+   struct brw_compile_fs_params params = {
+      .base = {
+         .nir = nir,
+         .stats = shader_data->stats,
+         .log_data = device,
+         .mem_ctx = mem_ctx,
+         .source_hash = shader_data->source_hash,
+      },
+      .key = &shader_data->key.wm,
+      .prog_data = &shader_data->prog_data.wm,
+      .mue_map = shader_data->mue_map,
+
+      .allow_spilling = true,
+      .max_polygons = UCHAR_MAX,
+   };
+
+   shader_data->code = (void *)brw_compile_fs(compiler, &params);
+   *error_str = params.base.error_str;
+
+   shader_data->num_stats = (uint32_t)!!shader_data->prog_data.wm.dispatch_multi +
+                            (uint32_t)shader_data->prog_data.wm.dispatch_8 +
+                            (uint32_t)shader_data->prog_data.wm.dispatch_16 +
+                            (uint32_t)shader_data->prog_data.wm.dispatch_32;
+   assert(shader_data->num_stats <= ARRAY_SIZE(shader_data->stats));
+
+   /* Update the push constant padding range now that we know the amount of
+    * per-primitive data delivered in the payload.
+    */
+   for (unsigned i = 0; i < ARRAY_SIZE(shader_data->bind_map.push_ranges); i++) {
+      if (shader_data->bind_map.push_ranges[i].set == ANV_DESCRIPTOR_SET_PER_PRIM_PADDING) {
+         shader_data->bind_map.push_ranges[i].length = MAX2(
+            shader_data->prog_data.wm.num_per_primitive_inputs / 2,
+            shader_data->bind_map.push_ranges[i].length);
+         break;
+      }
+   }
+}
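
Unlike the single-program stages, a fragment shader can be compiled for several dispatch modes at once, so num_stats above counts one statistics record per mode that actually produced code. A standalone model (field names shortened, not driver code):

   #include <stdbool.h>
   #include <stdint.h>
   #include <stdio.h>

   struct wm_modes { uint8_t multi; bool simd8, simd16, simd32; };

   static uint32_t
   count_stats(struct wm_modes m)
   {
      /* One statistics record per dispatch mode that got code. */
      return (uint32_t)!!m.multi + (uint32_t)m.simd8 +
             (uint32_t)m.simd16 + (uint32_t)m.simd32;
   }

   int main(void)
   {
      struct wm_modes m = { .simd8 = true, .simd16 = true };
      printf("%u\n", count_stats(m)); /* 2 */
      return 0;
   }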
+
+static void
+anv_shader_compile_cs(struct anv_device *device,
+                      void *mem_ctx,
+                      struct anv_shader_data *shader_data,
+                      char **error_str)
+{
+   const struct brw_compiler *compiler = device->physical->compiler;
+   nir_shader *nir = shader_data->info->nir;
+
+   shader_data->num_stats = 1;
+
+   struct brw_compile_cs_params params = {
+      .base = {
+         .nir = nir,
+         .stats = shader_data->stats,
+         .log_data = device,
+         .mem_ctx = mem_ctx,
+         .source_hash = shader_data->source_hash,
+      },
+      .key = &shader_data->key.cs,
+      .prog_data = &shader_data->prog_data.cs,
+   };
+
+   shader_data->code = (void *)brw_compile_cs(compiler, &params);
+   *error_str = params.base.error_str;
+}
+
+static bool
+should_remat_cb(nir_instr *instr, void *data)
+{
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   return nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_resource_intel;
+}
+
+static void
+anv_shader_compile_bs(struct anv_device *device,
+                      void *mem_ctx,
+                      struct anv_shader_data *shader_data,
+                      char **error_str)
+{
+   const struct brw_compiler *compiler = device->physical->compiler;
+   nir_shader *nir = shader_data->info->nir;
+   const struct intel_device_info *devinfo = compiler->devinfo;
+
+   struct brw_nir_lower_shader_calls_state lowering_state = {
+      .devinfo = devinfo,
+      .key = &shader_data->key.bs,
+   };
+
+   nir_shader **resume_shaders = NULL;
+   uint32_t num_resume_shaders = 0;
+   if (nir->info.stage != MESA_SHADER_COMPUTE) {
+      const nir_lower_shader_calls_options opts = {
+         .address_format = nir_address_format_64bit_global,
+         .stack_alignment = BRW_BTD_STACK_ALIGN,
+         .localized_loads = true,
+         .vectorizer_callback = brw_nir_should_vectorize_mem,
+         .vectorizer_data = NULL,
+         .should_remat_callback = should_remat_cb,
+      };
+
+      NIR_PASS(_, nir, nir_lower_shader_calls, &opts,
+               &resume_shaders, &num_resume_shaders, mem_ctx);
+      NIR_PASS(_, nir, brw_nir_lower_shader_calls, &lowering_state);
+      NIR_PASS(_, nir, brw_nir_lower_rt_intrinsics,
+               &shader_data->key.base, devinfo);
+   }
+
+   for (unsigned i = 0; i < num_resume_shaders; i++) {
+      NIR_PASS(_, resume_shaders[i], brw_nir_lower_shader_calls,
+               &lowering_state);
+      NIR_PASS(_, resume_shaders[i], brw_nir_lower_rt_intrinsics,
+               &shader_data->key.base, devinfo);
+   }
+
+   shader_data->num_stats = 1;
+
+   struct brw_compile_bs_params params = {
+      .base = {
+         .nir = nir,
+         .stats = shader_data->stats,
+         .log_data = device,
+         .mem_ctx = mem_ctx,
+         .source_hash = shader_data->source_hash,
+      },
+      .key = &shader_data->key.bs,
+      .prog_data = &shader_data->prog_data.bs,
+      .num_resume_shaders = num_resume_shaders,
+      .resume_shaders = resume_shaders,
+   };
+
+   shader_data->code = (void *)brw_compile_bs(compiler, &params);
+   *error_str = params.base.error_str;
+}
+
+static void
+shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
+{
+   assert(glsl_type_is_vector_or_scalar(type));
+
+   uint32_t comp_size = glsl_type_is_boolean(type)
+      ? 4 : glsl_get_bit_size(type) / 8;
+   unsigned length = glsl_get_vector_elements(type);
+   *size = comp_size * length;
+   *align = comp_size * (length == 3 ? 4 : length);
+}
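
The length == 3 ? 4 : length above encodes the rule that a vec3 is sized as three components but aligned like a vec4. A standalone check (assuming 32-bit components; not driver code):

   #include <stdio.h>

   static void
   type_size_align(unsigned comp_size, unsigned length,
                   unsigned *size, unsigned *align)
   {
      *size = comp_size * length;
      *align = comp_size * (length == 3 ? 4 : length); /* vec3 aligns as vec4 */
   }

   int main(void)
   {
      unsigned size, align;
      type_size_align(4, 3, &size, &align);      /* a 32-bit vec3 */
      printf("size=%u align=%u\n", size, align); /* size=12 align=16 */
      return 0;
   }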
+
+static void
+anv_shader_compute_fragment_rts(const struct brw_compiler *compiler,
+                                const struct vk_graphics_pipeline_state *state,
+                                struct anv_shader_data *shader_data)
+{
+   assert(shader_data->bind_map.surface_count == 0);
+
+   nir_shader *nir = shader_data->info->nir;
+   const uint64_t rt_mask = nir->info.outputs_written >> FRAG_RESULT_DATA0;
+   const unsigned num_rts = util_last_bit64(rt_mask);
+   struct anv_pipeline_binding rt_bindings[MAX_RTS];
+
+   shader_data->key.wm.color_outputs_valid = rt_mask & rp_color_mask(state);
+   shader_data->key.wm.nr_color_regions =
+      util_last_bit(shader_data->key.wm.color_outputs_valid);
+
+   if (num_rts > 0) {
+      for (unsigned rt = 0; rt < num_rts; rt++) {
+         if (nir->info.outputs_written & BITFIELD_BIT(FRAG_RESULT_DATA0 + rt)) {
+            rt_bindings[rt] = (struct anv_pipeline_binding) {
+               .set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS,
+               .index = rt,
+               .binding = UINT32_MAX,
+            };
+         } else {
+            /* Set up a null render target */
+            rt_bindings[rt] = (struct anv_pipeline_binding) {
+               .set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS,
+               .index = ANV_COLOR_OUTPUT_UNUSED,
+               .binding = UINT32_MAX,
+            };
+         }
+      }
+      shader_data->bind_map.surface_count = num_rts;
+   } else if (brw_nir_fs_needs_null_rt(
+                 compiler->devinfo, nir,
+                 shader_data->key.wm.alpha_to_coverage != INTEL_NEVER)) {
+      /* Set up a null render target */
+      rt_bindings[0] = (struct anv_pipeline_binding) {
+         .set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS,
+         .index = ANV_COLOR_OUTPUT_DISABLED,
+         .binding = UINT32_MAX,
+      };
+      shader_data->bind_map.surface_count = 1;
+   }
+
+   typed_memcpy(shader_data->bind_map.surface_to_descriptor,
+                rt_bindings, shader_data->bind_map.surface_count);
+}
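
Every render-target slot up to the highest one written gets a binding, and gaps get a null surface so the binding table stays dense. A toy model of that loop (the sentinel value and GCC-style __builtin_clzll stand in for the driver's ANV_COLOR_OUTPUT_UNUSED and util_last_bit64):

   #include <stdint.h>
   #include <stdio.h>

   #define UNUSED_RT 0xffffffffu /* stand-in sentinel */

   int main(void)
   {
      uint64_t rt_mask = 0x5; /* the shader writes RT0 and RT2 */
      unsigned num_rts = 64 - __builtin_clzll(rt_mask); /* highest bit + 1 */

      for (unsigned rt = 0; rt < num_rts; rt++) {
         uint32_t index = (rt_mask & (1ull << rt)) ? rt : UNUSED_RT;
         printf("slot %u -> %s\n", rt,
                index == UNUSED_RT ? "null surface" : "color attachment");
      }
      return 0;
   }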
+
+static bool
+accept_64bit_atomic_cb(const nir_intrinsic_instr *intrin, const void *data)
+{
+   return (intrin->intrinsic == nir_intrinsic_image_atomic ||
+           intrin->intrinsic == nir_intrinsic_image_atomic_swap ||
+           intrin->intrinsic == nir_intrinsic_image_deref_atomic ||
+           intrin->intrinsic == nir_intrinsic_image_deref_atomic_swap) &&
+          intrin->def.bit_size == 64;
+}
+
+static bool
+lower_non_tg4_non_uniform_offsets(const nir_tex_instr *tex,
+                                  unsigned index, void *data)
+{
+   /* HW cannot deal with divergent surfaces/samplers */
+   if (tex->src[index].src_type == nir_tex_src_texture_offset ||
+       tex->src[index].src_type == nir_tex_src_texture_handle ||
+       tex->src[index].src_type == nir_tex_src_sampler_offset ||
+       tex->src[index].src_type == nir_tex_src_sampler_handle)
+      return true;
+
+   if (tex->src[index].src_type == nir_tex_src_offset) {
+      /* HW can deal with TG4 divergent offsets only */
+      return tex->op != nir_texop_tg4;
+   }
+
+   return false;
+}
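
The callback above tells the non-uniform lowering which texture sources must be made uniform: surface/sampler handles and explicit texture/sampler offsets always, and the offset source only when the op is not a gather, since TG4 handles divergent offsets natively. A standalone model of that decision table (made-up enums, not driver code):

   #include <stdbool.h>
   #include <stdio.h>

   enum src { SRC_TEX_HANDLE, SRC_SAMPLER_HANDLE, SRC_OFFSET, SRC_COORD };
   enum op  { OP_TEX, OP_TG4 };

   static bool
   needs_uniform_lowering(enum op op, enum src src)
   {
      if (src == SRC_TEX_HANDLE || src == SRC_SAMPLER_HANDLE)
         return true;         /* HW can't do divergent surfaces/samplers */
      if (src == SRC_OFFSET)
         return op != OP_TG4; /* only TG4 takes divergent offsets */
      return false;
   }

   int main(void)
   {
      printf("%d\n", needs_uniform_lowering(OP_TEX, SRC_OFFSET)); /* 1 */
      printf("%d\n", needs_uniform_lowering(OP_TG4, SRC_OFFSET)); /* 0 */
      return 0;
   }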
+
+static void
+fixup_large_workgroup_image_coherency(nir_shader *nir)
+{
+   nir_foreach_function_impl(impl, nir) {
+      nir_foreach_block(block, impl) {
+         nir_foreach_instr(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+
+            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+            if (intr->intrinsic != nir_intrinsic_image_deref_store ||
+                nir_intrinsic_image_dim(intr) != GLSL_SAMPLER_DIM_3D)
+               continue;
+
+            /* We found a store to a 3D image. */
+            nir_deref_instr *array_deref = nir_src_as_deref(intr->src[0]);
+            if (array_deref->deref_type != nir_deref_type_array)
+               continue;
+
+            nir_alu_instr *alu = nir_src_as_alu_instr(intr->src[1]);
+            if (!alu || !nir_op_is_vec(alu->op))
+               return;
+
+            /* Check if any src is from @load_local_invocation_id. */
+            for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) {
+               nir_instr *parent = alu->src[i].src.ssa->parent_instr;
+               if (parent->type != nir_instr_type_intrinsic)
+                  continue;
+
+               nir_intrinsic_instr *parent_intr = nir_instr_as_intrinsic(parent);
+               if (parent_intr->intrinsic !=
+                   nir_intrinsic_load_local_invocation_id)
+                  continue;
+
+               /* Found a match; make the image access qualifier coherent. */
+               nir_deref_instr *parent_deref =
+                  nir_src_as_deref(array_deref->parent);
+               parent_deref->var->data.access = ACCESS_COHERENT;
+               return;
+            }
+         } /* instr */
+      } /* block */
+   } /* func */
+}
+
+static void
+anv_shader_lower_nir(struct anv_device *device,
+                     void *mem_ctx,
+                     const struct vk_graphics_pipeline_state *state,
+                     struct anv_shader_data *shader_data)
+{
+   const struct anv_physical_device *pdevice = device->physical;
+   const struct brw_compiler *compiler = pdevice->compiler;
+   struct anv_descriptor_set_layout * const *set_layouts =
+      (struct anv_descriptor_set_layout * const *) shader_data->info->set_layouts;
+   const uint32_t set_layout_count = shader_data->info->set_layout_count;
+
+   nir_shader *nir = shader_data->info->nir;
+
+   /* Workaround for apps that need fp64 support */
+   if (device->fp64_nir) {
+      NIR_PASS(_, nir, nir_lower_doubles, device->fp64_nir,
+               nir->options->lower_doubles_options);
+
+      bool fp_conv = false;
+      NIR_PASS(fp_conv, nir, nir_lower_int64_float_conversions);
+      if (fp_conv) {
+         NIR_PASS(_, nir, nir_opt_algebraic);
+         NIR_PASS(_, nir, nir_lower_doubles, device->fp64_nir,
+                  nir->options->lower_doubles_options);
+      }
+   }
+
+   if (nir->info.stage == MESA_SHADER_COMPUTE &&
+       pdevice->instance->large_workgroup_non_coherent_image_workaround) {
+      const unsigned local_size = nir->info.workgroup_size[0] *
+                                  nir->info.workgroup_size[1] *
+                                  nir->info.workgroup_size[2];
+      if (local_size == 64)
+         fixup_large_workgroup_image_coherency(nir);
+   }
+
+   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
+      NIR_PASS(_, nir, nir_lower_wpos_center);
+      NIR_PASS(_, nir, nir_lower_input_attachments,
+               &(nir_input_attachment_options) {
+                  .use_fragcoord_sysval = true,
+                  .use_layer_id_sysval = true,
+               });
+   }
+
+   if (nir->info.stage == MESA_SHADER_COMPUTE &&
+       shader_data->key.cs.lower_unaligned_dispatch) {
+      NIR_PASS(_, nir, anv_nir_lower_unaligned_dispatch);
+      /* The anv_nir_lower_unaligned_dispatch pass uses nir_jump_return,
+       * which we need to lower.
+       */
+      NIR_PASS(_, nir, nir_lower_returns);
+      /* Lower load_base_workgroup_id inserted by unaligned_dispatch */
+      nir_lower_compute_system_values_options lower_csv_options = {
+         .has_base_workgroup_id = true,
+      };
+      NIR_PASS(_, nir, nir_lower_compute_system_values, &lower_csv_options);
+   }
+
+   if (mesa_shader_stage_is_mesh(nir->info.stage)) {
+      nir_lower_compute_system_values_options options = {
+         .lower_workgroup_id_to_index = true,
+         /* nir_lower_idiv generates expensive code */
+         .shortcut_1d_workgroup_id = compiler->devinfo->verx10 >= 125,
+      };
+
+      NIR_PASS(_, nir, nir_lower_compute_system_values, &options);
+   }
+
+   NIR_PASS(_, nir, nir_vk_lower_ycbcr_tex, lookup_ycbcr_conversion, shader_data->info);
+
+   if (nir->info.stage <= MESA_SHADER_FRAGMENT) {
+      NIR_PASS(_, nir, anv_nir_lower_multiview,
+               shader_data->key.base.view_mask,
+               shader_data->use_primitive_replication);
+   }
+
+   if (nir->info.stage == MESA_SHADER_COMPUTE &&
+       nir->info.cs.has_cooperative_matrix) {
+      anv_fixup_subgroup_size(pdevice->instance, &nir->info);
+      NIR_PASS(_, nir, brw_nir_lower_cmat, nir->info.subgroup_size);
+      NIR_PASS(_, nir, nir_lower_indirect_derefs, nir_var_function_temp, 16);
+   }
+
+   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
+
+   /* Ensure robustness; do this before brw_nir_lower_storage_image so that
+    * the image size intrinsics added for bounds checking are properly
+    * lowered for cube images.
+    */
+   NIR_PASS(_, nir, nir_lower_robust_access,
+            accept_64bit_atomic_cb, NULL);
+
+   NIR_PASS(_, nir, brw_nir_lower_storage_image, compiler,
+            &(struct brw_nir_lower_storage_image_opts) {
+               /* Anv only supports Gfx9+, which has better-defined typed
+                * read behavior. That lets us restrict the lowering to
+                * loads.
+                */
+               .lower_loads = true,
+               .lower_stores_64bit = true,
+               .lower_loads_without_formats =
+                  pdevice->instance->emulate_read_without_format,
+            });
+
+   /* Switch from image to global */
+   NIR_PASS(_, nir, nir_lower_image_atomics_to_global,
+            accept_64bit_atomic_cb, NULL);
+
+   /* Detile for global */
+   NIR_PASS(_, nir, brw_nir_lower_texel_address, compiler->devinfo,
+            pdevice->isl_dev.shader_tiling);
+
+   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_global,
+            nir_address_format_64bit_global);
+   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_push_const,
+            nir_address_format_32bit_offset);
+
+   NIR_PASS(_, nir, brw_nir_lower_ray_queries, &pdevice->info);
+
+   shader_data->push_desc_info.used_descriptors =
+      anv_nir_compute_used_push_descriptors(
+         nir, set_layouts, set_layout_count);
+
+   /* Need to have render targets placed first in the bind_map */
+   if (nir->info.stage == MESA_SHADER_FRAGMENT)
+      anv_shader_compute_fragment_rts(compiler, state, shader_data);
+
+   /* Apply the actual pipeline layout to UBOs, SSBOs, and textures */
+   NIR_PASS(_, nir, anv_nir_apply_pipeline_layout,
+            pdevice, shader_data->key.base.robust_flags,
+            set_layouts, set_layout_count, NULL, /* TODO? */
+            &shader_data->bind_map, &shader_data->push_map, mem_ctx);
+
+   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ubo,
+            anv_nir_ubo_addr_format(pdevice, shader_data->key.base.robust_flags));
+   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ssbo,
+            anv_nir_ssbo_addr_format(pdevice, shader_data->key.base.robust_flags));
+
+   /* First run copy-prop to get rid of all the vec() operations that
+    * address calculations often create, and then constant-fold so that,
+    * when we get to anv_nir_lower_ubo_loads, we can detect constant
+    * offsets.
+    */
+   bool progress;
+   do {
+      progress = false;
+      NIR_PASS(progress, nir, nir_opt_algebraic);
+      NIR_PASS(progress, nir, nir_copy_prop);
+      NIR_PASS(progress, nir, nir_opt_constant_folding);
+      NIR_PASS(progress, nir, nir_opt_dce);
+   } while (progress);
+
+   /* Required for nir_divergence_analysis(), which is needed for
+    * anv_nir_lower_ubo_loads.
+    */
+   NIR_PASS(_, nir, nir_convert_to_lcssa, true, true);
+   nir_divergence_analysis(nir);
+
+   NIR_PASS(_, nir, anv_nir_lower_ubo_loads);
+
+   NIR_PASS(_, nir, nir_opt_remove_phis);
+
+   enum nir_lower_non_uniform_access_type lower_non_uniform_access_types =
+      nir_lower_non_uniform_texture_access |
+      nir_lower_non_uniform_image_access |
+      nir_lower_non_uniform_get_ssbo_size;
+
+   /* For textures, images, and samplers, the NonUniform decoration is
+    * required, but not for offsets, so we rely on divergence information
+    * there. Offsets had to be constant until KHR_maintenance8.
+    */
+   if (device->vk.enabled_features.maintenance8) {
+      nir_foreach_function_impl(impl, nir)
+         nir_metadata_require(impl, nir_metadata_divergence);
+   }
+
+   /* In practice, most shaders do not have non-uniform-qualified
+    * accesses (see
+    * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17558#note_1475069),
+    * so a cheaper check that will usually fail is run first.
+    */
+   if (nir_has_non_uniform_access(nir, lower_non_uniform_access_types)) {
+      NIR_PASS(_, nir, nir_opt_non_uniform_access);
+
+      /* We don't support non-uniform UBOs and non-uniform SSBO access is
+       * handled naturally by falling back to A64 messages.
+       */
+      NIR_PASS(_, nir, nir_lower_non_uniform_access,
+               &(nir_lower_non_uniform_access_options) {
+                  .types = lower_non_uniform_access_types,
+                  .tex_src_callback = lower_non_tg4_non_uniform_offsets,
+                  .callback = NULL,
+               });
+
+      NIR_PASS(_, nir, intel_nir_lower_non_uniform_resource_intel);
+      NIR_PASS(_, nir, intel_nir_cleanup_resource_intel);
+      NIR_PASS(_, nir, nir_opt_dce);
+   }
+
+   NIR_PASS(_, nir, anv_nir_update_resource_intel_block);
+
+   NIR_PASS(_, nir, anv_nir_compute_push_layout,
+            pdevice, shader_data->key.base.robust_flags,
+            &(struct anv_nir_push_layout_info) {
+               .separate_tessellation = (nir->info.stage == MESA_SHADER_TESS_CTRL &&
+                                         shader_data->key.tcs.separate_tess_vue_layout) ||
+                                        (nir->info.stage == MESA_SHADER_TESS_EVAL &&
+                                         shader_data->key.tes.separate_tess_vue_layout),
+               .fragment_dynamic = nir->info.stage == MESA_SHADER_FRAGMENT &&
+                                   brw_wm_prog_key_is_dynamic(&shader_data->key.wm),
+               .mesh_dynamic = nir->info.stage == MESA_SHADER_FRAGMENT &&
+                               shader_data->key.wm.mesh_input == INTEL_SOMETIMES,
+            },
+            &shader_data->key.base,
+            &shader_data->prog_data.base,
+            &shader_data->bind_map, &shader_data->push_map,
+            mem_ctx);
+
+   NIR_PASS(_, nir, anv_nir_lower_resource_intel, pdevice,
+            shader_data->bind_map.layout_type);
+
+   if (mesa_shader_stage_uses_workgroup(nir->info.stage)) {
+      NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
+               nir_var_mem_shared, shared_type_info);
+
+      NIR_PASS(_, nir, nir_lower_explicit_io,
+               nir_var_mem_shared, nir_address_format_32bit_offset);
+
+      if (nir->info.zero_initialize_shared_memory &&
+          nir->info.shared_size > 0) {
+         /* The effective Shared Local Memory size is at least 1024 bytes and
+          * is always rounded to a power of two, so it is OK to align the size
+          * used by the shader to chunk_size -- which does simplify the logic.
+          */
+         const unsigned chunk_size = 16;
+         const unsigned shared_size = ALIGN(nir->info.shared_size, chunk_size);
+         assert(shared_size <=
+                intel_compute_slm_calculate_size(compiler->devinfo->ver,
+                                                 nir->info.shared_size));
+
+         NIR_PASS(_, nir, nir_zero_initialize_shared_memory,
+                  shared_size, chunk_size);
+      }
+   }
+
+   if (mesa_shader_stage_is_compute(nir->info.stage) ||
+       mesa_shader_stage_is_mesh(nir->info.stage)) {
+      NIR_PASS(_, nir, brw_nir_lower_cs_intrinsics, compiler->devinfo,
+               &shader_data->prog_data.cs);
+   }
+
+   shader_data->push_desc_info.push_set_buffer =
+      anv_nir_loads_push_desc_buffer(
+         nir, set_layouts, set_layout_count, &shader_data->bind_map);
+   shader_data->push_desc_info.fully_promoted_ubo_descriptors =
+      anv_nir_push_desc_ubo_fully_promoted(
+         nir, set_layouts, set_layout_count, &shader_data->bind_map);
+}
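
As the shared-memory comment in anv_shader_lower_nir() notes, rounding the zero-initialized size up to the 16-byte chunk can never overrun the SLM allocation, since SLM is at least 1 KiB and power-of-two sized. A standalone check of that alignment step (not driver code):

   #include <stdio.h>

   #define ALIGN(v, a) (((v) + (a) - 1) & ~((a) - 1)) /* power-of-two align */

   int main(void)
   {
      const unsigned chunk_size = 16;
      unsigned shared_size = 1000; /* hypothetical shader SLM usage */
      unsigned zeroed_size = ALIGN(shared_size, chunk_size);
      printf("%u -> %u\n", shared_size, zeroed_size); /* 1000 -> 1008 <= 1024 */
      return 0;
   }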
+
+static uint32_t
+sets_layout_embedded_sampler_count(const struct vk_shader_compile_info *info)
+{
+   uint32_t count = 0;
+
+   for (uint32_t s = 0; s < info->set_layout_count; s++) {
+      if (info->set_layouts[s] == NULL)
+         continue;
+
+      const struct anv_descriptor_set_layout *layout =
+         (const struct anv_descriptor_set_layout *) info->set_layouts[s];
+      count += layout->embedded_sampler_count;
+   }
+
+   return count;
+}
+
+static void
+anv_shaders_pre_lower_gfx(struct anv_device *device,
+                          struct anv_shader_data *shaders_data,
+                          uint32_t shader_count,
+                          const struct vk_graphics_pipeline_state *state,
+                          void *mem_ctx)
+{
+   const struct intel_device_info *devinfo = device->info;
+   const struct brw_compiler *compiler = device->physical->compiler;
+
+   /* Walk backwards to link */
+   struct anv_shader_data *next_stage = NULL;
+   for (int s = shader_count - 1; s >= 0; s--) {
+      struct anv_shader_data *shader_data = &shaders_data[s];
+      struct vk_shader_compile_info *info = shader_data->info;
+
+      if (next_stage == NULL) {
+         next_stage = shader_data;
+         continue;
+      }
+
+      switch (info->stage) {
+      case MESA_SHADER_VERTEX:
+      case MESA_SHADER_TESS_EVAL:
+      case MESA_SHADER_TASK:
+      case MESA_SHADER_GEOMETRY:
+         anv_shader_link(compiler, info, next_stage->info);
+         break;
+      case MESA_SHADER_TESS_CTRL:
+         anv_shader_link_tcs(compiler,
+                             &shader_data->key.tcs,
+                             info, next_stage->info);
+         break;
+      case MESA_SHADER_MESH:
+         anv_shader_link(compiler, info, next_stage->info);
+         next_stage->mue_map = &shader_data->prog_data.mesh.map;
+         break;
+      default:
+         UNREACHABLE("Invalid graphics shader stage");
+      }
+
+      next_stage = shader_data;
+   }
+
+   bool use_primitive_replication = false;
+   if (devinfo->ver >= 12 && shaders_data[0].key.base.view_mask != 0) {
+      /* For some pipelines HW Primitive Replication can be used instead of
+       * instancing to implement Multiview. This depends on how viewIndex is
+       * used in all the active shaders, so the check can't be done per
+       * individual shader.
+       */
+      nir_shader *shaders[ANV_GRAPHICS_SHADER_STAGE_COUNT] = {};
+      VkShaderStageFlags vk_stages = 0;
+      for (unsigned s = 0; s < shader_count; s++) {
+         struct anv_shader_data *shader_data = &shaders_data[s];
+         shaders[shader_data->info->stage] = shader_data->info->nir;
+         vk_stages |= mesa_to_vk_shader_stage(shader_data->info->stage);
+      }
+
+      use_primitive_replication =
+         anv_check_for_primitive_replication(device, vk_stages, shaders,
+                                             shaders_data[0].key.base.view_mask);
+   }
+
+   for (uint32_t s = 0; s < shader_count; s++) {
+      struct anv_shader_data *shader_data = &shaders_data[s];
+      shader_data->use_primitive_replication = use_primitive_replication;
+      shader_data->instance_multiplier =
+         (shader_data->key.base.view_mask && !use_primitive_replication) ?
+         util_bitcount(shader_data->key.base.view_mask) : 1;
+   }
+}
+
+static void
+anv_shaders_post_lower_gfx(struct anv_device *device,
+                           struct anv_shader_data *shaders_data,
+                           uint32_t shader_count,
+                           const struct vk_graphics_pipeline_state *state)
+{
+   const struct brw_compiler *compiler = device->physical->compiler;
+
+   struct vk_shader_compile_info *prev_stage = NULL;
+   for (uint32_t s = 0; s < shader_count; s++) {
+      struct anv_shader_data *shader_data = &shaders_data[s];
+      struct vk_shader_compile_info *info = shader_data->info;
+
+      struct shader_info *cur_info = &shader_data->info->nir->info;
+
+      if (prev_stage && compiler->nir_options[info->stage]->unify_interfaces) {
+         struct shader_info *prev_info = &prev_stage->nir->info;
+
+         prev_info->outputs_written |= cur_info->inputs_read &
+            ~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER);
+         cur_info->inputs_read |= prev_info->outputs_written &
+            ~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER);
+         prev_info->patch_outputs_written |= cur_info->patch_inputs_read;
+         cur_info->patch_inputs_read |= prev_info->patch_outputs_written;
+      }
+
+      prev_stage = info;
+   }
+}
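
Interface unification above makes adjacent stages agree by OR-ing the producer's outputs into the consumer's inputs and vice versa, while keeping the tessellation-level varyings out of it. A standalone bitmask model (the bit positions are made up for the example):

   #include <stdint.h>
   #include <stdio.h>

   int main(void)
   {
      const uint64_t TESS_LEVEL_BITS = (1ull << 0) | (1ull << 1); /* hypothetical */
      uint64_t prev_outputs = (1ull << 0) | (1ull << 2); /* producer */
      uint64_t cur_inputs   = (1ull << 2) | (1ull << 3); /* consumer */

      prev_outputs |= cur_inputs & ~TESS_LEVEL_BITS;
      cur_inputs   |= prev_outputs & ~TESS_LEVEL_BITS;

      /* Both sides now agree on slots 2 and 3; the tess-level bit 0 stays
       * producer-only. */
      printf("outputs=0x%llx inputs=0x%llx\n", /* outputs=0xd inputs=0xc */
             (unsigned long long)prev_outputs,
             (unsigned long long)cur_inputs);
      return 0;
   }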
+
+static void
+anv_shaders_post_lower_rt(struct anv_device *device,
+                          struct anv_shader_data *shaders_data,
+                          uint32_t shader_count)
+{
+   for (uint32_t s = 0; s < shader_count; s++) {
+      struct anv_shader_data *shader_data = &shaders_data[s];
+      nir_shader *nir = shader_data->info->nir;
+
+      switch (nir->info.stage) {
+      case MESA_SHADER_RAYGEN:
+         brw_nir_lower_raygen(nir, device->info);
+         break;
+
+      case MESA_SHADER_ANY_HIT:
+         brw_nir_lower_any_hit(nir, device->info);
+         break;
+
+      case MESA_SHADER_CLOSEST_HIT:
+         brw_nir_lower_closest_hit(nir, device->info);
+         break;
+
+      case MESA_SHADER_MISS:
+         brw_nir_lower_miss(nir, device->info);
+         break;
+
+      case MESA_SHADER_CALLABLE:
+         brw_nir_lower_callable(nir, device->info);
+         break;
+
+      case MESA_SHADER_INTERSECTION:
+         /* Nothing to do; this is handled by the combined
+          * intersection/any-hit lowering.
+          */
+         break;
+
+      default:
+         UNREACHABLE("invalid stage");
+      }
+   }
+}
+
+static VkShaderStageFlags
+anv_shader_get_rt_group_linking(struct vk_physical_device *device,
+                                VkShaderStageFlags stages)
+{
+   const VkShaderStageFlags any_hit_intersection =
+      VK_SHADER_STAGE_ANY_HIT_BIT_KHR |
+      VK_SHADER_STAGE_INTERSECTION_BIT_KHR;
+
+   return (stages & any_hit_intersection) == any_hit_intersection ?
+          any_hit_intersection : 0;
+}
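
Group linking only kicks in when the create call contains both an any-hit and an intersection shader, since those are the two stages that get fused into one program. A standalone model of the mask test (bit values are stand-ins for the VkShaderStageFlagBits):

   #include <stdint.h>
   #include <stdio.h>

   #define ANY_HIT      (1u << 0) /* stand-in flags */
   #define INTERSECTION (1u << 1)

   static uint32_t
   rt_group_linking(uint32_t stages)
   {
      const uint32_t both = ANY_HIT | INTERSECTION;
      return (stages & both) == both ? both : 0;
   }

   int main(void)
   {
      printf("0x%x\n", rt_group_linking(ANY_HIT));                /* 0x0 */
      printf("0x%x\n", rt_group_linking(ANY_HIT | INTERSECTION)); /* 0x3 */
      return 0;
   }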
+
+static VkResult
+anv_shader_compile(struct vk_device *vk_device,
+                   uint32_t shader_count,
+                   struct vk_shader_compile_info *infos,
+                   const struct vk_graphics_pipeline_state *state,
+                   const struct vk_features *enabled_features,
+                   const VkAllocationCallbacks* pAllocator,
+                   struct vk_shader **shaders_out)
+{
+   struct anv_device *device =
+      container_of(vk_device, struct anv_device, vk);
+   VkResult result = VK_SUCCESS;
+
+   for (uint32_t i = 0; i < shader_count; i++)
+      shaders_out[i] = NULL;
+
+   void *mem_ctx = ralloc_context(NULL);
+
+   struct anv_shader_data *shaders_data =
+      rzalloc_array(mem_ctx, struct anv_shader_data, shader_count);
+   assert(shader_count < MAX2(ANV_GRAPHICS_SHADER_STAGE_COUNT,
+                              ANV_RT_SHADER_STAGE_COUNT));
+
+   /* Order the stages (no guarantee from the runtime) */
+   struct vk_shader_compile_info *ordered_infos[MESA_SHADER_KERNEL] = { 0 };
+   struct vk_shader **ordered_shaders_out[MESA_SHADER_KERNEL] = { 0 };
+   VkShaderStageFlags stages = 0;
+   for (uint32_t s = 0; s < shader_count; s++) {
+      ordered_infos[infos[s].stage] = &infos[s];
+      ordered_shaders_out[infos[s].stage] = &shaders_out[s];
+
+      /* The runtime transfers the ownership of the NIR to us, so we need to
+       * free it after compile.
+       */
+      ralloc_steal(mem_ctx, infos[s].nir);
+      stages |= mesa_to_vk_shader_stage(infos[s].stage);
+   }
+
+   {
+#define ADD_SHADER(_name) \
+   do { \
+      if (ordered_infos[MESA_SHADER_##_name]) { \
+         shaders_data[remapped_index].info = \
+            ordered_infos[MESA_SHADER_##_name]; \
+         shaders_data[remapped_index].shader_out = \
+            ordered_shaders_out[MESA_SHADER_##_name]; \
+         remapped_index++; \
+      } \
+   } while (0)
+
+      uint32_t remapped_index = 0;
+      ADD_SHADER(COMPUTE);
+      ADD_SHADER(VERTEX);
+      ADD_SHADER(TESS_CTRL);
+      ADD_SHADER(TESS_EVAL);
+      ADD_SHADER(GEOMETRY);
+      ADD_SHADER(TASK);
+      ADD_SHADER(MESH);
+      ADD_SHADER(FRAGMENT);
+      ADD_SHADER(RAYGEN);
+      ADD_SHADER(CLOSEST_HIT);
+      ADD_SHADER(INTERSECTION);
+      ADD_SHADER(ANY_HIT);
+      ADD_SHADER(MISS);
+      ADD_SHADER(CALLABLE);
+
+#undef ADD_SHADER
+   }
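
The ADD_SHADER block produces a dense, pipeline-ordered array no matter what order the runtime handed the shaders over in; the linking code relies on this when it walks the array backwards. A standalone sketch of the same compaction idea:

   #include <stdio.h>

   enum stage { VERTEX, TESS_CTRL, TESS_EVAL, GEOMETRY, FRAGMENT, NUM_STAGES };
   static const char *names[NUM_STAGES] = { "vs", "tcs", "tes", "gs", "fs" };

   int main(void)
   {
      /* Shaders as received: fs first, then vs (runtime order is arbitrary). */
      const enum stage received[] = { FRAGMENT, VERTEX };
      const char *by_stage[NUM_STAGES] = { 0 };

      for (unsigned i = 0; i < 2; i++)
         by_stage[received[i]] = names[received[i]];

      /* Compact into pipeline order, skipping absent stages. */
      unsigned remapped_index = 0;
      for (unsigned s = 0; s < NUM_STAGES; s++) {
         if (by_stage[s])
            printf("slot %u: %s\n", remapped_index++, by_stage[s]);
      }
      return 0; /* prints vs, then fs */
   }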
+
+   /* From now on, don't use infos[] anymore. */
+
+   for (uint32_t s = 0; s < shader_count; s++) {
+      struct anv_shader_data *shader_data = &shaders_data[s];
+      struct vk_shader_compile_info *info = shader_data->info;
+
+      shader_data->source_hash = ((uint32_t*)info->nir->info.source_blake3)[0];
+
+      shader_data->bind_map.layout_type =
+         set_layouts_get_layout_type((struct anv_descriptor_set_layout * const *)info->set_layouts,
+                                     info->set_layout_count);
+      shader_data->bind_map.surface_to_descriptor =
+         brw_shader_stage_requires_bindless_resources(info->stage) ? NULL :
+         rzalloc_array(mem_ctx, struct anv_pipeline_binding, 256);
+      shader_data->bind_map.sampler_to_descriptor =
+         brw_shader_stage_requires_bindless_resources(info->stage) ? NULL :
+         rzalloc_array(mem_ctx, struct anv_pipeline_binding, 256);
+      shader_data->bind_map.embedded_sampler_to_binding =
+         rzalloc_array(mem_ctx, struct anv_pipeline_embedded_sampler_binding,
+                       sets_layout_embedded_sampler_count(info));
+
+      shader_data->prog_data.base.stage = info->stage;
+
+      switch (info->stage) {
+      case MESA_SHADER_VERTEX:
+         populate_vs_prog_key(&shader_data->key.vs, vk_device->physical,
+                              info->robustness, state, stages);
+         break;
+      case MESA_SHADER_TESS_CTRL:
+         populate_tcs_prog_key(&shader_data->key.tcs, vk_device->physical,
+                               info->robustness, state, stages);
+         break;
+      case MESA_SHADER_TESS_EVAL:
+         populate_tes_prog_key(&shader_data->key.tes, vk_device->physical,
+                               info->robustness, state, stages);
+         break;
+      case MESA_SHADER_GEOMETRY:
+         populate_gs_prog_key(&shader_data->key.gs, vk_device->physical,
+                              info->robustness, state, stages);
+         break;
+      case MESA_SHADER_TASK:
+         populate_task_prog_key(&shader_data->key.task, vk_device->physical,
+                                info->robustness, state, stages);
+         break;
+      case MESA_SHADER_MESH:
+         populate_mesh_prog_key(&shader_data->key.mesh, vk_device->physical,
+                                info->robustness, state, stages);
+         break;
+      case MESA_SHADER_FRAGMENT:
+         populate_wm_prog_key(&shader_data->key.wm, vk_device->physical,
+                              info->robustness, state, stages);
+         break;
+      case MESA_SHADER_COMPUTE:
+         populate_cs_prog_key(&shader_data->key.cs, vk_device->physical,
+                              info->robustness,
+                              info->flags & VK_SHADER_CREATE_UNALIGNED_DISPATCH_BIT_MESA);
+         break;
+      case MESA_SHADER_RAYGEN:
+      case MESA_SHADER_ANY_HIT:
+      case MESA_SHADER_CLOSEST_HIT:
+      case MESA_SHADER_MISS:
+      case MESA_SHADER_INTERSECTION:
+      case MESA_SHADER_CALLABLE:
+         populate_bs_prog_key(&shader_data->key.bs, vk_device->physical,
+                              info->robustness, info->rt_flags);
+         break;
+      default:
+         UNREACHABLE("Invalid stage");
+      }
+   }
+
+   {
+      /* We're going to do cross-stage linking if we have a fragment shader
+       * together with any other stage (that would include all the
+       * associated pre-rasterization stages of the pipeline).
+       */
+      const bool separate_shaders =
+         !(shader_count > 1 && ordered_infos[MESA_SHADER_FRAGMENT] != NULL);
+
+      for (uint32_t s = 0; s < shader_count; s++)
+         shaders_data[s].info->nir->info.separate_shader = separate_shaders;
+   }
+
+   if (mesa_shader_stage_is_graphics(shaders_data[0].info->stage)) {
+      anv_shaders_pre_lower_gfx(device, shaders_data, shader_count,
+                                state, mem_ctx);
+   }
+
+   for (uint32_t s = 0; s < shader_count; s++) {
+      struct anv_shader_data *shader_data = &shaders_data[s];
+
+      anv_shader_lower_nir(device, mem_ctx, state, shader_data);
+
+      anv_fixup_subgroup_size(device->physical->instance,
+                              &shader_data->info->nir->info);
+   }
+
+   /* Combine intersection & any-hit before lowering */
+   if (ordered_infos[MESA_SHADER_INTERSECTION] != NULL) {
+      brw_nir_lower_combined_intersection_any_hit(
+         ordered_infos[MESA_SHADER_INTERSECTION]->nir,
+         ordered_infos[MESA_SHADER_ANY_HIT] != NULL ?
+         ordered_infos[MESA_SHADER_ANY_HIT]->nir : NULL,
+         device->info);
+   }
+
+   if (mesa_shader_stage_is_graphics(shaders_data[0].info->stage))
+      anv_shaders_post_lower_gfx(device, shaders_data, shader_count, state);
+   else if (mesa_shader_stage_is_rt(shaders_data[0].info->stage))
+      anv_shaders_post_lower_rt(device, shaders_data, shader_count);
+
+   for (uint32_t s = 0; s < shader_count; s++) {
+      struct anv_shader_data *shader_data = &shaders_data[s];
+      struct anv_shader_data *prev_shader_data =
+         s > 0 ? &shaders_data[s - 1] : NULL;
+
+      char *error_str = NULL;
+      switch (shader_data->info->stage) {
+      case MESA_SHADER_VERTEX:
+         anv_shader_compile_vs(device, mem_ctx, shader_data, &error_str);
+         break;
+      case MESA_SHADER_TESS_CTRL:
+         anv_shader_compile_tcs(device, mem_ctx, shader_data, &error_str);
+         break;
+      case MESA_SHADER_TESS_EVAL:
+         anv_shader_compile_tes(device, mem_ctx,
+                                &shaders_data[s], prev_shader_data,
+                                &error_str);
+         break;
+      case MESA_SHADER_GEOMETRY:
+         anv_shader_compile_gs(device, mem_ctx, shader_data, &error_str);
+         break;
+      case MESA_SHADER_TASK:
+         anv_shader_compile_task(device, mem_ctx, shader_data, &error_str);
+         break;
+      case MESA_SHADER_MESH:
+         anv_shader_compile_mesh(device, mem_ctx,
+                                 &shaders_data[s], prev_shader_data,
+                                 &error_str);
+         break;
+      case MESA_SHADER_FRAGMENT:
+         anv_shader_compile_fs(device, mem_ctx, shader_data, state, &error_str);
+         break;
+      case MESA_SHADER_COMPUTE:
+         anv_shader_compile_cs(device, mem_ctx, shader_data, &error_str);
+         break;
+      case MESA_SHADER_RAYGEN:
+      case MESA_SHADER_ANY_HIT:
+      case MESA_SHADER_CLOSEST_HIT:
+      case MESA_SHADER_MISS:
+      case MESA_SHADER_INTERSECTION:
+      case MESA_SHADER_CALLABLE:
+         anv_shader_compile_bs(device, mem_ctx, shader_data, &error_str);
+         break;
+      default:
+         UNREACHABLE("Invalid shader stage");
+      }
+
+      if (shader_data->code == NULL) {
+         if (error_str)
+            result = vk_errorf(device, VK_ERROR_UNKNOWN, "%s", error_str);
+         else
+            result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+         goto fail;
+      }
+
+      anv_nir_validate_push_layout(device->physical,
+                                   &shader_data->prog_data.base,
+                                   &shader_data->bind_map);
+
+      shader_data->xfb_info = shader_data->info->nir->xfb_info;
+
+      result = anv_shader_create(device, shader_data->info->stage,
+                                 shader_data, pAllocator,
+                                 shader_data->shader_out);
+      if (result != VK_SUCCESS)
+         goto fail;
+   }
+
+   ralloc_free(mem_ctx);
+
+#if 0
+   /* TODO: Write the feedback index into the pipeline */
+   for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
+      anv_pipeline_account_shader(&pipeline->base, pipeline->shaders[s]);
+   }
+#endif
+
+   return VK_SUCCESS;
+
+fail:
+   ralloc_free(mem_ctx);
+
+   for (unsigned s = 0; s < shader_count; s++) {
+      if (shaders_out[s] != NULL)
+         vk_shader_free(vk_device, &vk_device->alloc, shaders_out[s]);
+   }
+
+   return result;
+}
+
+struct vk_device_shader_ops anv_device_shader_ops = {
+   .get_nir_options = anv_shader_get_nir_options,
+   .get_spirv_options = anv_shader_get_spirv_options,
+   .preprocess_nir = anv_shader_preprocess_nir,
+   .get_rt_group_linking = anv_shader_get_rt_group_linking,
+   .hash_state = anv_shader_hash_state,
+   .compile = anv_shader_compile,
+   .deserialize = anv_shader_deserialize,
+   .cmd_set_dynamic_graphics_state = vk_cmd_set_dynamic_graphics_state,
+};
diff --git a/src/intel/vulkan/meson.build b/src/intel/vulkan/meson.build
index e52246cb952..01dc6bff2b1 100644
--- a/src/intel/vulkan/meson.build
+++ b/src/intel/vulkan/meson.build
@@ -187,6 +187,8 @@ libanv_files = files(
   'anv_rmv.c',
   'anv_rmv.h',
   'anv_sampler.c',
+  'anv_shader.c',
+  'anv_shader_compile.c',
   'anv_slab_bo.c',
   'anv_slab_bo.h',
   'anv_sparse.c',