From 6f5d30c0a2f55a79831ab754dc4b13625c87512d Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Wed, 5 Jun 2024 15:48:55 +0300 Subject: [PATCH] anv: add apply_layout support for device bindable shaders/pipelines We consider them like bindless stages (no binding table) as much as possible. Signed-off-by: Lionel Landwerlin Acked-by: Alyssa Rosenzweig Part-of: --- src/intel/compiler/intel_shader_enums.h | 1 + src/intel/vulkan/anv_nir.h | 1 + .../vulkan/anv_nir_apply_pipeline_layout.c | 270 ++++++++++++++---- src/intel/vulkan/anv_shader.c | 4 + src/intel/vulkan/anv_shader_compile.c | 3 +- 5 files changed, 224 insertions(+), 55 deletions(-) diff --git a/src/intel/compiler/intel_shader_enums.h b/src/intel/compiler/intel_shader_enums.h index de04d69514b..ad582f5a66f 100644 --- a/src/intel/compiler/intel_shader_enums.h +++ b/src/intel/compiler/intel_shader_enums.h @@ -604,6 +604,7 @@ enum intel_shader_reloc_id { BRW_SHADER_RELOC_RESUME_SBT_ADDR_HIGH, BRW_SHADER_RELOC_DESCRIPTORS_ADDR_HIGH, BRW_SHADER_RELOC_DESCRIPTORS_BUFFER_ADDR_HIGH, + BRW_SHADER_RELOC_PUSH_DESCRIPTORS_BUFFER_ADDR_HIGH, BRW_SHADER_RELOC_DESCRIPTORS_VIEW_HANDLE, BRW_SHADER_RELOC_DESCRIPTORS_BUFFERS_VIEW_HANDLE, BRW_SHADER_RELOC_INSTRUCTION_BASE_ADDR_HIGH, diff --git a/src/intel/vulkan/anv_nir.h b/src/intel/vulkan/anv_nir.h index f5878bd5e75..e162f6ce235 100644 --- a/src/intel/vulkan/anv_nir.h +++ b/src/intel/vulkan/anv_nir.h @@ -113,6 +113,7 @@ bool anv_nir_apply_pipeline_layout(nir_shader *shader, struct anv_descriptor_set_layout * const *set_layouts, uint32_t set_count, const uint32_t *dynamic_offset_start, + bool device_bindable, struct anv_pipeline_bind_map *map, struct anv_pipeline_push_map *push_map, void *push_map_mem_ctx); diff --git a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c index fda002cc239..a11b6ee836f 100644 --- a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c +++ b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c @@ -64,9 
+64,11 @@ struct apply_pipeline_layout_state { bool uses_constants; bool has_dynamic_buffers; + bool is_device_bindable; uint8_t constants_offset; nir_variable *set_idx_to_bti; + nir_variable *set_idx_to_base_addr; nir_variable *set_idx_to_offset; struct { @@ -505,8 +507,6 @@ build_load_descriptor_mem_from_res_index(nir_builder *b, { struct res_index_defs res = unpack_res_index(b, res_index); - nir_def *surface_index = nir_load_array_var(b, state->set_idx_to_bti, res.set); - nir_def *offset32 = nir_iadd_imm( b, nir_iadd(b, @@ -515,12 +515,24 @@ build_load_descriptor_mem_from_res_index(nir_builder *b, nir_imul(b, res.array_index, res.desc_stride))), imm_offset); - return nir_load_ubo(b, num_components, bit_size, - surface_index, offset32, - .align_mul = 8, - .align_offset = imm_offset % 8, - .range_base = 0, - .range = num_components * bit_size / 8); + if (state->pdevice->info.has_lsc || !state->is_device_bindable) { + nir_def *surface_index = nir_load_array_var(b, state->set_idx_to_bti, res.set); + + return nir_load_ubo(b, num_components, bit_size, + surface_index, offset32, + .align_mul = 8, + .align_offset = imm_offset % 8, + .range_base = 0, + .range = num_components * bit_size / 8); + } else { + return nir_load_global_constant( + b, num_components, bit_size, + nir_pack_64_2x32_split(b, offset32, + nir_load_array_var(b, state->set_idx_to_base_addr, + res.set)), + .align_mul = 8, + .align_offset = imm_offset % 8); + } } /* When using direct descriptor, we do not have a structure to read in memory @@ -704,30 +716,6 @@ build_load_storage_3d_image_depth(nir_builder *b, } } -/** Build a 64bit_global_32bit_offset address for a descriptor set */ -static nir_def * -build_desc_address64(nir_builder *b, nir_def *set_idx, unsigned set_idx_imm, - const struct apply_pipeline_layout_state *state) -{ - nir_def *desc_offset = set_idx != NULL ? 
- anv_load_driver_uniform_indexed(b, 1, desc_surface_offsets, set_idx) : - anv_load_driver_uniform(b, 1, desc_surface_offsets[set_idx_imm]); - desc_offset = nir_iand_imm(b, desc_offset, ANV_DESCRIPTOR_SET_OFFSET_MASK); - if (state->bind_map->layout_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER && - !intel_has_extended_bindless(&state->pdevice->info)) { - nir_def *bindless_base_offset = - anv_load_driver_uniform(b, 1, surfaces_base_offset); - desc_offset = nir_iadd(b, bindless_base_offset, desc_offset); - } - return nir_pack_64_2x32_split( - b, desc_offset, - nir_load_reloc_const_intel( - b, - state->bind_map->layout_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER ? - BRW_SHADER_RELOC_DESCRIPTORS_BUFFER_ADDR_HIGH : - BRW_SHADER_RELOC_DESCRIPTORS_ADDR_HIGH)); -} - static nir_def * build_descriptor_set_bti(nir_builder *b, uint32_t set, @@ -769,12 +757,38 @@ build_descriptor_set_bti(nir_builder *b, } } +static nir_def * +build_descriptor_set_base_address(nir_builder *b, + uint32_t set, + const struct apply_pipeline_layout_state *state) +{ + const bool is_push_set = set < state->set_count && + state->set_layouts[set] != NULL && + (state->set_layouts[set]->vk.flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR); + enum intel_shader_reloc_id reloc_id; + + if (state->pdevice->info.has_lsc) { + reloc_id = + state->bind_map->layout_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER ? + BRW_SHADER_RELOC_DESCRIPTORS_BUFFER_ADDR_HIGH : + BRW_SHADER_RELOC_DESCRIPTORS_ADDR_HIGH; + } else { + reloc_id = + state->bind_map->layout_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER ? + is_push_set ? 
BRW_SHADER_RELOC_PUSH_DESCRIPTORS_BUFFER_ADDR_HIGH : + BRW_SHADER_RELOC_DESCRIPTORS_BUFFER_ADDR_HIGH : + BRW_SHADER_RELOC_DESCRIPTORS_ADDR_HIGH; + } + + return nir_load_reloc_const_intel(b, reloc_id); +} + static nir_def * build_descriptor_set_offset(nir_builder *b, uint32_t set, const struct apply_pipeline_layout_state *state) { - if (state->pdevice->info.has_lsc) { + if (state->pdevice->info.has_lsc || state->is_device_bindable) { return nir_iand_imm(b, anv_load_driver_uniform(b, 1, desc_surface_offsets[set]), ANV_DESCRIPTOR_SET_OFFSET_MASK /* array_index */); @@ -801,6 +815,27 @@ build_desc_address32(nir_builder *b, offset)); } +/** Build a 64bit_bounded_global address for a descriptor set */ +static nir_def * +build_desc_address64(nir_builder *b, + nir_def *set_idx, unsigned set, + nir_def *offset, + uint32_t range, + const struct apply_pipeline_layout_state *state) +{ + return nir_vec4(b, + nir_iadd(b, + nir_load_array_var(b, state->set_idx_to_offset, + set < MAX_SETS ? + nir_imm_int(b, set) : set_idx), + offset), + nir_load_array_var(b, state->set_idx_to_base_addr, + set < MAX_SETS ? 
+ nir_imm_int(b, set) : set_idx), + nir_imm_int(b, range), + nir_imm_int(b, 0)); +} + /** Whether a surface is accessed through the bindless surface state heap */ static bool is_binding_bindless(unsigned set, unsigned binding, bool sampler, @@ -880,19 +915,6 @@ build_desc_addr_for_binding(nir_builder *b, return build_desc_address32(b, NULL, set, desc_offset, state); } -static nir_def * -build_inline_desc_addr32(nir_builder *b, - unsigned set, - const struct anv_descriptor_set_binding_layout *bind_layout, - const struct apply_pipeline_layout_state *state) -{ - return nir_vec2( - b, - nir_load_array_var_imm(b, state->set_idx_to_bti, set), - nir_iadd_imm(b, nir_load_array_var_imm(b, state->set_idx_to_offset, set), - bind_layout->descriptor_surface_offset)); -} - static unsigned binding_descriptor_offset(const struct apply_pipeline_layout_state *state, const struct anv_descriptor_set_binding_layout *bind_layout, @@ -1237,9 +1259,16 @@ build_buffer_addr_for_binding(nir_builder *b, &state->set_layouts[set]->binding[binding]; if (bind_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { - return build_desc_address32(b, NULL, set, - nir_imm_int(b, bind_layout->descriptor_surface_offset), - state); + if (!state->pdevice->info.has_lsc && state->is_device_bindable) { + return build_desc_address64(b, NULL, set, + nir_imm_int(b, bind_layout->descriptor_surface_offset), + bind_layout->array_size, + state); + } else { + return build_desc_address32(b, NULL, set, + nir_imm_int(b, bind_layout->descriptor_surface_offset), + state); + } } if (addr_format != nir_address_format_32bit_index_offset) @@ -1428,9 +1457,6 @@ try_lower_direct_buffer_intrinsic(nir_builder *b, return false; } - /* Rewrite to 32bit_index_offset whenever we can */ - addr_format = nir_address_format_32bit_index_offset; - /* If a dynamic has not been assigned a binding table entry, we need to * bail here. 
*/ @@ -1438,6 +1464,12 @@ try_lower_direct_buffer_intrinsic(nir_builder *b, !descriptor_has_bti(desc, state)) return false; + /* Rewrite to 32bit_index_offset */ + addr_format = + (!state->pdevice->info.has_lsc && state->is_device_bindable) ? + nir_address_format_64bit_bounded_global : + nir_address_format_32bit_index_offset; + nir_def *addr = build_buffer_addr_for_deref(b, deref, addr_format, state); @@ -2190,6 +2222,115 @@ binding_should_use_sampler_binding_table(const struct apply_pipeline_layout_stat return true; } +/* This builds a binding table based on the push descriptor layout, so all the + * shaders using a compatible layout can share the same binding table. For LSC + * platforms the binding table should already be empty (except render + * targets), so it's already compatible. + */ +static void +build_device_bindable_binding_table(struct apply_pipeline_layout_state *state, + nir_shader *shader, + struct anv_pipeline_bind_map *map, + struct anv_pipeline_push_map *push_map, + void *push_map_mem_ctx) +{ + /* Compute the amount of push block items required. */ + unsigned push_block_count = map->surface_count + MAX_SETS; + for (unsigned s = 0; s < state->set_count; s++) { + const struct anv_descriptor_set_layout *set_layout = + state->set_layouts[s]; + if (!set_layout) + continue; + + for (unsigned b = 0; b < set_layout->binding_count; b++) { + if (set_layout->binding[b].type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) + push_block_count += set_layout->binding[b].array_size; + } + } + + /* Assign a block index for each surface */ + push_map->block_to_descriptor = rzalloc_array(push_map_mem_ctx, + struct anv_pipeline_binding, + push_block_count); + + memcpy(push_map->block_to_descriptor, + map->surface_to_descriptor, + sizeof(push_map->block_to_descriptor[0]) * map->surface_count); + push_map->block_count = map->surface_count; + + /* No BTI allowed for descriptor sets, we'll use A64 messages. 
*/ + for (unsigned s = 0; s < state->set_count; s++) { + state->set[s].desc_offset = BINDLESS_OFFSET; + + if (brw_shader_stage_requires_bindless_resources(shader->info.stage)) { + state->set[s].push_block = UINT32_MAX; + } else { + state->set[s].push_block = push_map->block_count; + add_descriptor_push_entry(push_map, s, state->bind_map); + } + } + + for (uint32_t set = 0; set < state->set_count; set++) { + struct anv_descriptor_set_layout *set_layout = + state->set_layouts[set]; + if (!set_layout) + continue; + + for (unsigned b = 0; b < set_layout->binding_count; b++) { + const struct anv_descriptor_set_binding_layout *bind_layout = + &set_layout->binding[b]; + + /* Assume bindless by default */ + state->set[set].binding[b].surface_offset = BINDLESS_OFFSET; + state->set[set].binding[b].sampler_offset = BINDLESS_OFFSET; + + if (binding_is_promotable_to_push(set_layout, bind_layout)) { + if (bind_layout->type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { + state->set[set].binding[b].push_block = push_map->block_count; + for (unsigned i = 0; i < bind_layout->array_size; i++) + add_push_entry(push_map, set, b, i, bind_layout); + } + } + + if (!state->pdevice->info.has_lsc && + (set_layout->vk.flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT) && + (set_layout->vk.flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT)) { + /* Embedded samplers have an array size limited to 1 */ + if (state->set[set].binding[b].properties & BINDING_PROPERTY_EMBEDDED_SAMPLER) + add_embedded_sampler_entry(state, map, set, b); + + const uint8_t max_planes = bti_multiplier(state, set, b); + + for (unsigned i = 0; i < bind_layout->array_size; i++) { + if (bind_layout->data & ANV_DESCRIPTOR_BTI_SURFACE_STATE) { + const uint8_t max_sampler_planes = + (bind_layout->samplers && + bind_layout->samplers[i].has_ycbcr_conversion) ? 
+ vk_format_get_plane_count( + bind_layout->samplers[i].ycbcr_conversion_state.format) : + 1; + for (uint8_t p = 0; p < max_planes; p++) { + if (p < max_sampler_planes) { + add_bti_entry(map, set, b, i, p, bind_layout); + } else { + add_null_bti_entry(map); + } + } + } + + if (!(state->set[set].binding[b].properties & BINDING_PROPERTY_EMBEDDED_SAMPLER) && + (bind_layout->data & ANV_DESCRIPTOR_BTI_SAMPLER_STATE)) { + for (unsigned i = 0; i < bind_layout->array_size; i++) { + for (uint8_t p = 0; p < max_planes; p++) + add_sampler_entry(map, set, b, i, p, bind_layout); + } + } + } + } + } + } +} + static void build_packed_binding_table(struct apply_pipeline_layout_state *state, nir_shader *shader, @@ -2429,6 +2570,22 @@ build_descriptor_sets_bti_array(nir_builder *b, return set_to_bti; } +static nir_variable * +build_descriptor_sets_base_addr_array(nir_builder *b, + const struct apply_pipeline_layout_state *state) +{ + nir_variable *set_to_base_addr = nir_local_variable_create( + b->impl, glsl_array_type(glsl_uint_type(), MAX_SETS, 0), + "set_to_base_addr"); + + for (uint32_t i = 0; i < MAX_SETS; i++) { + nir_store_array_var(b, set_to_base_addr, nir_imm_int(b, i), + build_descriptor_set_base_address(b, i, state), 0x1); + } + + return set_to_base_addr; +} + static nir_variable * build_descriptor_sets_offset_array(nir_builder *b, const struct apply_pipeline_layout_state *state) @@ -2452,6 +2609,7 @@ anv_nir_apply_pipeline_layout(nir_shader *shader, struct anv_descriptor_set_layout * const *set_layouts, uint32_t set_count, const uint32_t *dynamic_offset_start, + bool device_bindable, struct anv_pipeline_bind_map *map, struct anv_pipeline_push_map *push_map, void *push_map_mem_ctx) @@ -2473,6 +2631,7 @@ anv_nir_apply_pipeline_layout(nir_shader *shader, .dynamic_offset_start = dynamic_offset_start, .ssbo_addr_format = anv_nir_ssbo_addr_format(pdevice, robust_flags), .ubo_addr_format = anv_nir_ubo_addr_format(pdevice, robust_flags), + .is_device_bindable = device_bindable, 
}; state.lowered_instrs = _mesa_pointer_set_create(state.mem_ctx); @@ -2491,8 +2650,10 @@ anv_nir_apply_pipeline_layout(nir_shader *shader, progress |= nir_shader_instructions_pass(shader, get_used_bindings, nir_metadata_all, &state); - /* Build the binding table */ - build_packed_binding_table(&state, shader, map, push_map, push_map_mem_ctx); + if (device_bindable) + build_device_bindable_binding_table(&state, shader, map, push_map, push_map_mem_ctx); + else + build_packed_binding_table(&state, shader, map, push_map, push_map_mem_ctx); /* Before we do the normal lowering, we look for any SSBO operations * that we can lower to the BTI model and lower them up-front. The BTI @@ -2527,6 +2688,7 @@ anv_nir_apply_pipeline_layout(nir_shader *shader, nir_foreach_function_impl(impl, shader) { nir_builder _b = nir_builder_at(nir_before_impl(impl)), *b = &_b; state.set_idx_to_bti = build_descriptor_sets_bti_array(b, &state); + state.set_idx_to_base_addr = build_descriptor_sets_base_addr_array(b, &state); state.set_idx_to_offset = build_descriptor_sets_offset_array(b, &state); progress |= nir_function_instructions_pass(impl, lower_direct_buffer_instr, diff --git a/src/intel/vulkan/anv_shader.c b/src/intel/vulkan/anv_shader.c index 7fa6506419b..72a1d607529 100644 --- a/src/intel/vulkan/anv_shader.c +++ b/src/intel/vulkan/anv_shader.c @@ -564,6 +564,10 @@ anv_shader_set_relocs(struct anv_device *device, .id = BRW_SHADER_RELOC_DESCRIPTORS_BUFFER_ADDR_HIGH, .value = device->physical->va.dynamic_visible_pool.addr >> 32, }; + reloc_values[rv_count++] = (struct intel_shader_reloc_value) { + .id = BRW_SHADER_RELOC_PUSH_DESCRIPTORS_BUFFER_ADDR_HIGH, + .value = device->physical->va.internal_surface_state_pool.addr >> 32, + }; assert((device->physical->va.indirect_descriptor_pool.addr & 0xffffffff) == 0); assert((device->physical->va.internal_surface_state_pool.addr & 0xffffffff) == 0); reloc_values[rv_count++] = (struct intel_shader_reloc_value) { diff --git 
a/src/intel/vulkan/anv_shader_compile.c b/src/intel/vulkan/anv_shader_compile.c index d182eb17a66..e6129dc2f1d 100644 --- a/src/intel/vulkan/anv_shader_compile.c +++ b/src/intel/vulkan/anv_shader_compile.c @@ -1613,8 +1613,9 @@ anv_shader_lower_nir(struct anv_device *device, pdevice, shader_data->key.base.robust_flags, set_layouts, set_layout_count, (shader_data->info->flags & - VK_SHADER_CREATE_INDEPENDENT_SETS_BIT_MESA) ? NULL: + VK_SHADER_CREATE_INDEPENDENT_SETS_BIT_MESA) ? NULL : dynamic_descriptors_offsets, + shader_data->info->flags & VK_SHADER_CREATE_INDIRECT_BINDABLE_BIT_EXT, &shader_data->bind_map, &shader_data->push_map, mem_ctx); }