From e94cb92cb00acaeb4c627cf186b6b86a6a1828ed Mon Sep 17 00:00:00 2001
From: Lionel Landwerlin
Date: Thu, 22 May 2025 15:43:47 +0300
Subject: [PATCH] anv: use internal surface state on Gfx12.5+ to access
 descriptor buffers

As a result, on Gfx12.5+ we no longer hold any binding table entries to
access descriptor buffers. This should reduce the number of binding
table allocations.

Signed-off-by: Lionel Landwerlin
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/10711
Reviewed-by: Kenneth Graunke
Part-of: 
---
 src/intel/vulkan/anv_cmd_buffer.c             |  31 +-
 src/intel/vulkan/anv_nir.h                    |  37 ++
 .../vulkan/anv_nir_apply_pipeline_layout.c    | 222 +++++---
 .../vulkan/anv_nir_compute_push_layout.c      | 526 +++++++++++-------
 .../vulkan/anv_nir_lower_resource_intel.c     |   3 +
 .../vulkan/anv_nir_push_constants_analysis.c  | 336 +++++++++++
 .../vulkan/anv_nir_push_descriptor_analysis.c |   4 +
 src/intel/vulkan/genX_cmd_buffer.c            |   4 +
 src/intel/vulkan/meson.build                  |   1 +
 9 files changed, 865 insertions(+), 299 deletions(-)
 create mode 100644 src/intel/vulkan/anv_nir_push_constants_analysis.c

diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c
index 12cfd41c7d2..a889b8ca8a1 100644
--- a/src/intel/vulkan/anv_cmd_buffer.c
+++ b/src/intel/vulkan/anv_cmd_buffer.c
@@ -649,24 +649,10 @@ anv_cmd_buffer_bind_descriptor_set(struct anv_cmd_buffer *cmd_buffer,
       cmd_buffer->state.descriptors_dirty |= stages;
       cmd_buffer->state.descriptor_buffers.offsets_dirty |= stages;
    } else {
-      /* When using indirect descriptors, stages that have access to the HW
-       * binding tables, never need to access the
-       * anv_push_constants::desc_offsets fields, because any data they
-       * need from the descriptor buffer is accessible through a binding
-       * table entry. For stages that are "bindless" (Mesh/Task/RT), we
-       * need to provide anv_push_constants::desc_offsets matching the
-       * bound descriptor so that shaders can access the descriptor buffer
-       * through A64 messages.
-       *
-       * With direct descriptors, the shaders can use the
-       * anv_push_constants::desc_offsets to build bindless offsets. So
-       * it's we always need to update the push constant data.
-       */
+      /* Platforms with LSC will use descriptor buffer push constant
+       * offsets. */
-      bool update_desc_sets =
-         !cmd_buffer->device->physical->indirect_descriptors ||
-         (stages & (VK_SHADER_STAGE_TASK_BIT_EXT |
-                    VK_SHADER_STAGE_MESH_BIT_EXT |
-                    ANV_RT_STAGE_BITS));
+      bool update_desc_sets = cmd_buffer->device->info->has_lsc;
 
       if (update_desc_sets) {
          struct anv_push_constants *push = &pipe_state->push_constants;
@@ -679,14 +665,15 @@ anv_cmd_buffer_bind_descriptor_set(struct anv_cmd_buffer *cmd_buffer,
          push->desc_sampler_offsets[set_index] =
            anv_address_physical(set->desc_sampler_addr) -
           cmd_buffer->device->physical->va.dynamic_state_pool.addr;
-
-         anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
-                               set->desc_surface_addr.bo);
-         anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
-                               set->desc_sampler_addr.bo);
       }
    }
 
+   /* Always add a reference to the buffers */
+   anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
+                         set->desc_surface_addr.bo);
+   anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
+                         set->desc_sampler_addr.bo);
+
    dirty_stages |= stages;
 }
 
diff --git a/src/intel/vulkan/anv_nir.h b/src/intel/vulkan/anv_nir.h
index 52aed745257..56f22ff1deb 100644
--- a/src/intel/vulkan/anv_nir.h
+++ b/src/intel/vulkan/anv_nir.h
@@ -133,6 +133,9 @@ void anv_nir_validate_push_layout(const struct anv_physical_device *pdevice,
 
 bool anv_nir_update_resource_intel_block(nir_shader *shader);
 
+bool anv_nir_lower_desc_address(nir_shader *shader,
+                                const struct anv_pipeline_bind_map *map);
+
 bool anv_nir_lower_unaligned_dispatch(nir_shader *shader);
 
 bool anv_nir_lower_resource_intel(nir_shader *shader,
@@ -159,6 +162,40 @@ void anv_apply_per_prim_attr_wa(struct nir_shader *ms_nir,
                                 struct nir_shader *fs_nir,
                                 struct anv_device *device);
 
+static inline bool
+anv_nir_is_promotable_ubo_binding(nir_src src)
+{
+   nir_intrinsic_instr *intrin = nir_src_as_intrinsic(src);
+
+   return intrin && intrin->intrinsic == nir_intrinsic_resource_intel &&
+          (nir_intrinsic_resource_access_intel(intrin) &
+           nir_resource_intel_pushable);
+}
+
+static inline bool
+anv_nir_is_internal_ubo(nir_src src)
+{
+   nir_intrinsic_instr *intrin = nir_src_as_intrinsic(src);
+
+   return intrin && intrin->intrinsic == nir_intrinsic_resource_intel &&
+          (nir_intrinsic_resource_access_intel(intrin) &
+           nir_resource_intel_internal);
+}
+
+static inline unsigned
+anv_nir_get_ubo_binding_push_block(nir_src src)
+{
+   nir_intrinsic_instr *intrin = nir_src_as_intrinsic(src);
+   assert(intrin && intrin->intrinsic == nir_intrinsic_resource_intel);
+
+   return nir_intrinsic_resource_block_intel(intrin);
+}
+
+void anv_nir_analyze_push_constants_ranges(nir_shader *nir,
+                                           const struct intel_device_info *devinfo,
+                                           const struct anv_pipeline_push_map *push_map,
+                                           struct anv_push_range out_ranges[4]);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
index 77f6b76a019..5adba88d6dc 100644
--- a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
+++ b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
@@ -72,6 +72,7 @@ struct apply_pipeline_layout_state {
    struct {
       bool desc_buffer_used;
       uint8_t desc_offset;
+      uint32_t push_block;
 
    struct anv_binding_apply_layout {
       uint8_t use_count;
@@ -389,35 +390,17 @@
 build_load_descriptor_mem(nir_builder *b,
                           const struct apply_pipeline_layout_state *state)
 {
-   switch (state->desc_addr_format) {
-   case nir_address_format_64bit_global_32bit_offset: {
-      nir_def *base_addr =
-         nir_pack_64_2x32(b, nir_trim_vector(b, desc_addr, 2));
-      nir_def *offset32 =
nir_iadd_imm(b, nir_channel(b, desc_addr, 3), desc_offset); + assert(state->desc_addr_format == nir_address_format_32bit_index_offset); - return nir_load_global_constant_offset(b, num_components, bit_size, - base_addr, offset32, - .align_mul = 8, - .align_offset = desc_offset % 8); - } + nir_def *surface_index = nir_channel(b, desc_addr, 0); + nir_def *offset32 = nir_iadd_imm(b, nir_channel(b, desc_addr, 1), desc_offset); - case nir_address_format_32bit_index_offset: { - nir_def *surface_index = nir_channel(b, desc_addr, 0); - nir_def *offset32 = - nir_iadd_imm(b, nir_channel(b, desc_addr, 1), desc_offset); - - return nir_load_ubo(b, num_components, bit_size, - surface_index, offset32, - .align_mul = 8, - .align_offset = desc_offset % 8, - .range_base = 0, - .range = num_components * bit_size / 8); - } - - default: - UNREACHABLE("Unsupported address format"); - } + return nir_load_ubo(b, num_components, bit_size, + surface_index, offset32, + .align_mul = 8, + .align_offset = desc_offset % 8, + .range_base = 0, + .range = num_components * bit_size / 8); } /* When using direct descriptor, we do not have a structure to read in memory @@ -635,11 +618,14 @@ build_desc_address64(nir_builder *b, nir_def *set_idx, unsigned set_idx_imm, /** Build a 32bit_index_offset address for a descriptor set */ static nir_def * build_desc_address32(nir_builder *b, - nir_def *set_idx, nir_def *offset, + nir_def *set_idx, unsigned set, + nir_def *offset, const struct apply_pipeline_layout_state *state) { return nir_vec2(b, - nir_vector_extract(b, state->set_idx_to_bti, set_idx), + nir_vector_extract( + b, state->set_idx_to_bti, + set < MAX_SETS ? nir_imm_int(b, set) : set_idx), offset); } @@ -820,7 +806,7 @@ build_desc_addr_for_res_index(nir_builder *b, } case nir_address_format_32bit_index_offset: - return build_desc_address32(b, res.set_idx, desc_offset, state); + return build_desc_address32(b, res.set_idx, UINT32_MAX, desc_offset, state); default: UNREACHABLE("Unhandled address format"); @@ -830,7 +816,7 @@ build_desc_addr_for_res_index(nir_builder *b, case nir_address_format_32bit_index_offset: assert(desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK); assert(state->desc_addr_format == nir_address_format_32bit_index_offset); - return build_desc_address32(b, res.set_idx, desc_offset, state); + return build_desc_address32(b, res.set_idx, UINT32_MAX, desc_offset, state); default: UNREACHABLE("Unhandled address format"); @@ -878,7 +864,7 @@ build_desc_addr_for_binding(nir_builder *b, desc_offset = nir_iadd_imm( b, desc_offset, plane * bind_layout->descriptor_data_surface_size); } - return build_desc_address32(b, nir_imm_int(b, set), desc_offset, state); + return build_desc_address32(b, NULL, set, desc_offset, state); } default: @@ -1245,10 +1231,10 @@ build_buffer_addr_for_binding(nir_builder *b, if (addr_format != nir_address_format_32bit_index_offset) return build_buffer_addr_for_res_index(b, desc_type, res_index, addr_format, state); - if (desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { - const struct anv_descriptor_set_binding_layout *bind_layout = - &state->set_layouts[set]->binding[binding]; - return build_desc_address32(b, nir_imm_int(b, set), + const struct anv_descriptor_set_binding_layout *bind_layout = + &state->set_layouts[set]->binding[binding]; + if (bind_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { + return build_desc_address32(b, NULL, set, nir_imm_int(b, bind_layout->descriptor_surface_offset), state); } @@ -1426,9 +1412,6 @@ try_lower_direct_buffer_intrinsic(nir_builder 
*b, if (state->bind_map->layout_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT && !descriptor_has_bti(desc, state)) return false; - - /* Rewrite to 32bit_index_offset whenever we can */ - addr_format = nir_address_format_32bit_index_offset; } else { assert(nir_deref_mode_is(deref, nir_var_mem_ubo)); @@ -1444,15 +1427,11 @@ try_lower_direct_buffer_intrinsic(nir_builder *b, bind_layout->type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK && !descriptor_has_bti(desc, state)) return false; - - /* If this is an inline uniform and the shader stage is bindless, we - * can't switch to 32bit_index_offset. - */ - if (bind_layout->type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK || - !brw_shader_stage_requires_bindless_resources(b->shader->info.stage)) - addr_format = nir_address_format_32bit_index_offset; } + /* Rewrite to 32bit_index_offset whenever we can */ + addr_format = nir_address_format_32bit_index_offset; + /* If a dynamic has not been assigned a binding table entry, we need to * bail here. */ @@ -2072,7 +2051,7 @@ binding_is_promotable_to_push(const struct anv_descriptor_set_layout *set_layout return (bind_layout->flags & non_pushable_binding_flags) == 0; } -static void +static uint32_t add_null_bti_entry(struct anv_pipeline_bind_map *map) { map->surface_to_descriptor[map->surface_count++] = @@ -2080,9 +2059,25 @@ add_null_bti_entry(struct anv_pipeline_bind_map *map) .set = ANV_DESCRIPTOR_SET_NULL, }; assert(map->surface_count <= MAX_BINDING_TABLE_SIZE); + return map->surface_count - 1; } -static void +static uint32_t +add_desc_bti_entry(struct anv_pipeline_bind_map *map, + uint32_t set) +{ + map->surface_to_descriptor[map->surface_count++] = + (struct anv_pipeline_binding) { + .set = ANV_DESCRIPTOR_SET_DESCRIPTORS, + .binding = UINT32_MAX, + .index = set, + }; + assert(map->surface_count <= MAX_BINDING_TABLE_SIZE); + + return map->surface_count - 1; +} + +static uint32_t add_bti_entry(struct anv_pipeline_bind_map *map, uint32_t set, uint32_t binding, @@ -2101,9 +2096,11 @@ add_bti_entry(struct anv_pipeline_bind_map *map, .plane = plane, }; assert(map->surface_count <= MAX_BINDING_TABLE_SIZE); + + return map->surface_count - 1; } -static void +static uint32_t add_dynamic_bti_entry(struct anv_pipeline_bind_map *map, uint32_t set, uint32_t binding, @@ -2120,6 +2117,8 @@ add_dynamic_bti_entry(struct anv_pipeline_bind_map *map, .dynamic_offset_index = bind_layout->dynamic_offset_index + element, }; assert(map->surface_count <= MAX_BINDING_TABLE_SIZE); + + return map->surface_count - 1; } static void @@ -2139,6 +2138,19 @@ add_sampler_entry(struct anv_pipeline_bind_map *map, }; } +static void +add_descriptor_push_entry(struct anv_pipeline_push_map *push_map, + uint32_t set, + struct anv_pipeline_bind_map *map) +{ + push_map->block_to_descriptor[push_map->block_count++] = + (struct anv_pipeline_binding) { + .set = ANV_DESCRIPTOR_SET_DESCRIPTORS, + .binding = UINT32_MAX, + .index = set, + }; +} + static void add_push_entry(struct anv_pipeline_push_map *push_map, uint32_t set, @@ -2218,7 +2230,7 @@ build_packed_binding_table(struct apply_pipeline_layout_state *state, void *push_map_mem_ctx) { /* Compute the amount of push block items required. 
    */
-   unsigned push_block_count = 0;
+   unsigned push_block_count = map->surface_count + MAX_SETS;
    for (unsigned s = 0; s < state->set_count; s++) {
       const struct anv_descriptor_set_layout *set_layout =
          state->set_layouts[s];
@@ -2231,31 +2243,37 @@ build_packed_binding_table(struct apply_pipeline_layout_state *state,
       }
    }
 
-   /* Assign a BTI to each used descriptor set */
-   for (unsigned s = 0; s < state->set_count; s++) {
-      if (state->desc_addr_format != nir_address_format_32bit_index_offset) {
-         state->set[s].desc_offset = BINDLESS_OFFSET;
-      } else if (state->set[s].desc_buffer_used) {
-         map->surface_to_descriptor[map->surface_count] =
-            (struct anv_pipeline_binding) {
-               .set = ANV_DESCRIPTOR_SET_DESCRIPTORS,
-               .binding = UINT32_MAX,
-               .index = s,
-            };
-         state->set[s].desc_offset = map->surface_count++;
-      }
-   }
-
    /* Assign a block index for each surface */
-   push_map->block_to_descriptor =
-      rzalloc_array(push_map_mem_ctx, struct anv_pipeline_binding,
-                    map->surface_count + push_block_count);
+   push_map->block_to_descriptor = rzalloc_array(push_map_mem_ctx,
+                                                 struct anv_pipeline_binding,
+                                                 push_block_count);
    memcpy(push_map->block_to_descriptor,
           map->surface_to_descriptor,
           sizeof(push_map->block_to_descriptor[0]) * map->surface_count);
    push_map->block_count = map->surface_count;
 
+   /* Assign a BTI to each used descriptor set */
+   for (unsigned s = 0; s < state->set_count; s++) {
+      if (state->set[s].desc_buffer_used) {
+         /* Only add a binding table entry on platforms that cannot use
+          * LSC_ADDR_SURFTYPE_SS.
+          */
+         if (!state->pdevice->info.has_lsc)
+            state->set[s].desc_offset = add_desc_bti_entry(map, s);
+
+         if (brw_shader_stage_requires_bindless_resources(shader->info.stage)) {
+            state->set[s].push_block = UINT32_MAX;
+         } else {
+            state->set[s].push_block = push_map->block_count;
+            add_descriptor_push_entry(push_map, s, state->bind_map);
+         }
+      } else {
+         state->set[s].desc_offset = BINDLESS_OFFSET;
+         state->set[s].push_block = UINT32_MAX;
+      }
+   }
+
    /* Count used bindings, assign embedded sampler indices & add push blocks
    * for promotion to push constants
    */
@@ -2428,19 +2446,59 @@ build_packed_binding_table(struct apply_pipeline_layout_state *state,
 }
 
 static nir_def *
-build_descriptor_bti_vec(nir_builder *b,
+build_descriptor_set_bti(nir_builder *b,
+                         uint32_t set,
                          const struct apply_pipeline_layout_state *state)
+{
+   if (state->pdevice->info.has_lsc) {
+      nir_def *surface_handle =
+         nir_load_reloc_const_intel(
+            b,
+            state->bind_map->layout_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER ?
+ BRW_SHADER_RELOC_DESCRIPTORS_BUFFERS_VIEW_HANDLE : + BRW_SHADER_RELOC_DESCRIPTORS_VIEW_HANDLE); + + return nir_resource_intel( + b, + nir_imm_int(b, set), + surface_handle, + nir_iand_imm(b, + anv_load_driver_uniform(b, 1, desc_surface_offsets[set]), + ANV_DESCRIPTOR_SET_OFFSET_MASK) /* array_index */, + nir_imm_int(b, 0) /* bindless_base_offset */, + .desc_set = set, + .binding = -1, + .resource_block_intel = state->set[set].push_block, + .resource_access_intel = nir_resource_intel_pushable | + nir_resource_intel_internal); + } else { + return nir_resource_intel( + b, + nir_imm_int(b, set), + nir_imm_int(b, state->set[set].desc_offset), + nir_imm_int(b, 0) /* array_index */, + nir_imm_int(b, 0) /* bindless_base_offset */, + .desc_set = set, + .binding = -1, + .resource_block_intel = state->set[set].desc_offset, + .resource_access_intel = nir_resource_intel_pushable); + } +} + +static nir_def * +build_descriptor_sets_bti_vec(nir_builder *b, + const struct apply_pipeline_layout_state *state) { STATIC_ASSERT(MAX_SETS == 8); return nir_vec8(b, - nir_imm_int(b, state->set[0].desc_offset), - nir_imm_int(b, state->set[1].desc_offset), - nir_imm_int(b, state->set[2].desc_offset), - nir_imm_int(b, state->set[3].desc_offset), - nir_imm_int(b, state->set[4].desc_offset), - nir_imm_int(b, state->set[5].desc_offset), - nir_imm_int(b, state->set[6].desc_offset), - nir_imm_int(b, state->set[7].desc_offset)); + build_descriptor_set_bti(b, 0, state), + build_descriptor_set_bti(b, 1, state), + build_descriptor_set_bti(b, 2, state), + build_descriptor_set_bti(b, 3, state), + build_descriptor_set_bti(b, 4, state), + build_descriptor_set_bti(b, 5, state), + build_descriptor_set_bti(b, 6, state), + build_descriptor_set_bti(b, 7, state)); } bool @@ -2462,8 +2520,6 @@ anv_nir_apply_pipeline_layout(nir_shader *shader, anv_validate_pipeline_layout(set_layouts, set_count, shader); #endif - const bool bindless_stage = - brw_shader_stage_requires_bindless_resources(shader->info.stage); struct apply_pipeline_layout_state state = { .mem_ctx = ralloc_context(NULL), .pdevice = pdevice, @@ -2471,9 +2527,7 @@ anv_nir_apply_pipeline_layout(nir_shader *shader, .set_layouts = set_layouts, .set_count = set_count, .dynamic_offset_start = dynamic_offset_start, - .desc_addr_format = bindless_stage ? 
- nir_address_format_64bit_global_32bit_offset : - nir_address_format_32bit_index_offset, + .desc_addr_format = nir_address_format_32bit_index_offset, .ssbo_addr_format = anv_nir_ssbo_addr_format(pdevice, robust_flags), .ubo_addr_format = anv_nir_ubo_addr_format(pdevice, robust_flags), }; @@ -2529,7 +2583,7 @@ anv_nir_apply_pipeline_layout(nir_shader *shader, */ nir_foreach_function_impl(impl, shader) { nir_builder _b = nir_builder_at(nir_before_impl(impl)), *b = &_b; - state.set_idx_to_bti = build_descriptor_bti_vec(b, &state); + state.set_idx_to_bti = build_descriptor_sets_bti_vec(b, &state); progress |= nir_function_instructions_pass(impl, lower_direct_buffer_instr, nir_metadata_control_flow, @@ -2543,7 +2597,7 @@ anv_nir_apply_pipeline_layout(nir_shader *shader, nir_foreach_function_impl(impl, shader) { nir_builder _b = nir_builder_at(nir_before_impl(impl)), *b = &_b; - state.set_idx_to_bti = build_descriptor_bti_vec(b, &state); + state.set_idx_to_bti = build_descriptor_sets_bti_vec(b, &state); progress |= nir_function_instructions_pass(impl, apply_pipeline_layout, nir_metadata_control_flow, diff --git a/src/intel/vulkan/anv_nir_compute_push_layout.c b/src/intel/vulkan/anv_nir_compute_push_layout.c index 59cc3711841..6707a169f48 100644 --- a/src/intel/vulkan/anv_nir_compute_push_layout.c +++ b/src/intel/vulkan/anv_nir_compute_push_layout.c @@ -25,111 +25,90 @@ #include "nir_builder.h" #include "compiler/brw/brw_nir.h" #include "util/mesa-sha1.h" +#include "util/set.h" -struct lower_to_push_data_intel_state { - const struct anv_pipeline_bind_map *bind_map; - const struct anv_pipeline_push_map *push_map; +struct push_data { + bool push_ubo_ranges; + bool needs_wa_18019110168; + bool needs_dyn_tess_config; + unsigned app_start, app_end; + unsigned driver_start, driver_end; }; -static bool -lower_to_push_data_intel(nir_builder *b, - nir_intrinsic_instr *intrin, - void *data) +static void +adjust_driver_push_values(nir_shader *nir, + enum brw_robustness_flags robust_flags, + const struct anv_nir_push_layout_info *push_info, + struct brw_base_prog_key *prog_key, + const struct intel_device_info *devinfo, + struct push_data *data) { - const struct lower_to_push_data_intel_state *state = data; - /* With bindless shaders we load uniforms with SEND messages. All the push - * constants are located after the RT_DISPATCH_GLOBALS. We just need to add - * the offset to the address right after RT_DISPATCH_GLOBALS (see - * brw_nir_lower_rt_intrinsics.c). - */ - const unsigned base_offset = - brw_shader_stage_is_bindless(b->shader->info.stage) ? - 0 : state->bind_map->push_ranges[0].start * 32; - - switch (intrin->intrinsic) { - case nir_intrinsic_load_push_data_intel: { - nir_intrinsic_set_base(intrin, nir_intrinsic_base(intrin) - base_offset); - return true; + if (data->push_ubo_ranges && (robust_flags & BRW_ROBUSTNESS_UBO)) { + /* We can't on-the-fly adjust our push ranges because doing so would + * mess up the layout in the shader. When robustBufferAccess is + * enabled, we push a mask into the shader indicating which pushed + * registers are valid and we zero out the invalid ones at the top of + * the shader. 
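+       *
+       * As a rough sketch of the idea (not the exact generated code),
+       * each pushed GRF r is effectively guarded by
+       *
+       *    value = (push_reg_mask & (1ull << r)) ? grf[r] : 0;
+       *
+       * so pushed UBO data beyond the bound range reads back as zero.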
+ */ + const uint32_t push_reg_mask_start = + anv_drv_const_offset(gfx.push_reg_mask[nir->info.stage]); + const uint32_t push_reg_mask_end = + push_reg_mask_start + + anv_drv_const_size(gfx.push_reg_mask[nir->info.stage]); + data->driver_start = MIN2(data->driver_start, push_reg_mask_start); + data->driver_end = MAX2(data->driver_end, push_reg_mask_end); } - case nir_intrinsic_load_push_constant: { - b->cursor = nir_before_instr(&intrin->instr); - nir_def *data = nir_load_push_data_intel( - b, - intrin->def.num_components, - intrin->def.bit_size, - intrin->src[0].ssa, - .base = nir_intrinsic_base(intrin) - base_offset, - .range = nir_intrinsic_range(intrin)); - nir_def_replace(&intrin->def, data); - return true; - } - - case nir_intrinsic_load_ubo: { - if (!brw_nir_ubo_surface_index_is_pushable(intrin->src[0]) || - !nir_src_is_const(intrin->src[1])) - return false; - - const int block = brw_nir_ubo_surface_index_get_push_block(intrin->src[0]); - const unsigned byte_offset = nir_src_as_uint(intrin->src[1]); - const unsigned num_components = - nir_def_last_component_read(&intrin->def) + 1; - const int bytes = num_components * (intrin->def.bit_size / 8); - - const struct anv_pipeline_binding *binding = - &state->push_map->block_to_descriptor[block]; - - uint32_t range_offset = 0; - const struct anv_push_range *push_range = NULL; - for (uint32_t i = 0; i < 4; i++) { - if (state->bind_map->push_ranges[i].set == binding->set && - state->bind_map->push_ranges[i].index == binding->index && - byte_offset >= state->bind_map->push_ranges[i].start * 32 && - (byte_offset + bytes) <= (state->bind_map->push_ranges[i].start + - state->bind_map->push_ranges[i].length) * 32) { - push_range = &state->bind_map->push_ranges[i]; - break; - } else { - range_offset += state->bind_map->push_ranges[i].length * 32; - } + if (nir->info.stage == MESA_SHADER_FRAGMENT) { + if (push_info->fragment_dynamic) { + const uint32_t fs_config_start = anv_drv_const_offset(gfx.fs_config); + const uint32_t fs_config_end = fs_config_start + + anv_drv_const_size(gfx.fs_config); + data->driver_start = MIN2(data->driver_start, fs_config_start); + data->driver_end = MAX2(data->driver_end, fs_config_end); } - if (push_range == NULL) - return false; - - b->cursor = nir_before_instr(&intrin->instr); - nir_def *data = nir_load_push_data_intel( - b, - nir_def_last_component_read(&intrin->def) + 1, - intrin->def.bit_size, - nir_imm_int(b, 0), - .base = range_offset + byte_offset - push_range->start * 32, - .range = nir_intrinsic_range(intrin)); - nir_def_replace(&intrin->def, data); - return true; + if (data->needs_wa_18019110168) { + const uint32_t fs_per_prim_remap_start = + anv_drv_const_offset(gfx.fs_per_prim_remap_offset); + const uint32_t fs_per_prim_remap_end = + fs_per_prim_remap_start + + anv_drv_const_size(gfx.fs_per_prim_remap_offset); + data->driver_start = MIN2(data->driver_start, fs_per_prim_remap_start); + data->driver_end = MAX2(data->driver_end, fs_per_prim_remap_end); + } } - default: - return false; + data->needs_dyn_tess_config = + (nir->info.stage == MESA_SHADER_TESS_CTRL && + (container_of(prog_key, struct brw_tcs_prog_key, base)->input_vertices == 0 || + push_info->separate_tessellation)) || + (nir->info.stage == MESA_SHADER_TESS_EVAL && + push_info->separate_tessellation); + if (data->needs_dyn_tess_config) { + const uint32_t tess_config_start = anv_drv_const_offset(gfx.tess_config); + const uint32_t tess_config_end = tess_config_start + + anv_drv_const_size(gfx.tess_config); + data->driver_start = 
MIN2(data->driver_start, tess_config_start); + data->driver_end = MAX2(data->driver_end, tess_config_end); } } -bool -anv_nir_compute_push_layout(nir_shader *nir, - const struct anv_physical_device *pdevice, - enum brw_robustness_flags robust_flags, - const struct anv_nir_push_layout_info *push_info, - struct brw_base_prog_key *prog_key, - struct brw_stage_prog_data *prog_data, - struct anv_pipeline_bind_map *map, - const struct anv_pipeline_push_map *push_map) +static struct push_data +gather_push_data(nir_shader *nir, + enum brw_robustness_flags robust_flags, + const struct intel_device_info *devinfo, + const struct anv_nir_push_layout_info *push_info, + struct brw_base_prog_key *prog_key, + struct anv_pipeline_bind_map *map, + struct set *lowered_ubo_instrs) { - const struct brw_compiler *compiler = pdevice->compiler; - const struct intel_device_info *devinfo = compiler->devinfo; - memset(map->push_ranges, 0, sizeof(map->push_ranges)); - bool has_const_ubo = false; - unsigned push_start = UINT_MAX, push_end = 0; + struct push_data data = { + .app_start = UINT_MAX, .app_end = 0, + .driver_start = UINT_MAX, .driver_end = 0, + }; + nir_foreach_function_impl(impl, nir) { nir_foreach_block(block, impl) { nir_foreach_instr(instr, block) { @@ -144,12 +123,26 @@ anv_nir_compute_push_layout(nir_shader *nir, has_const_ubo = true; break; - case nir_intrinsic_load_push_constant: - case nir_intrinsic_load_push_data_intel: { + + case nir_intrinsic_load_push_constant: { unsigned base = nir_intrinsic_base(intrin); unsigned range = nir_intrinsic_range(intrin); - push_start = MIN2(push_start, base); - push_end = MAX2(push_end, base + range); + data.app_start = MIN2(data.app_start, base); + data.app_end = MAX2(data.app_end, base + range); + break; + } + + case nir_intrinsic_load_push_data_intel: { + if (lowered_ubo_instrs && + _mesa_set_search(lowered_ubo_instrs, intrin)) { + has_const_ubo = true; + break; + } + + unsigned base = nir_intrinsic_base(intrin); + unsigned range = nir_intrinsic_range(intrin); + data.driver_start = MIN2(data.driver_start, base); + data.driver_end = MAX2(data.driver_end, base + range); /* We need to retain this information to update the push * constant on vkCmdDispatch*(). */ @@ -167,67 +160,161 @@ anv_nir_compute_push_layout(nir_shader *nir, } } - const bool push_ubo_ranges = + data.push_ubo_ranges = has_const_ubo && nir->info.stage != MESA_SHADER_COMPUTE && !brw_shader_stage_requires_bindless_resources(nir->info.stage); - const bool needs_wa_18019110168 = + data.needs_wa_18019110168 = nir->info.stage == MESA_SHADER_FRAGMENT && brw_nir_fragment_shader_needs_wa_18019110168( devinfo, push_info->mesh_dynamic ? INTEL_SOMETIMES : INTEL_NEVER, nir); - if (push_ubo_ranges && (robust_flags & BRW_ROBUSTNESS_UBO)) { - /* We can't on-the-fly adjust our push ranges because doing so would - * mess up the layout in the shader. When robustBufferAccess is - * enabled, we push a mask into the shader indicating which pushed - * registers are valid and we zero out the invalid ones at the top of - * the shader. 
+   adjust_driver_push_values(nir, robust_flags, push_info,
+                             prog_key, devinfo, &data);
+
+   return data;
+}
+
+struct lower_to_push_data_intel_state {
+   const struct anv_pipeline_bind_map *bind_map;
+   const struct anv_pipeline_push_map *push_map;
+
+   struct set *lowered_ubo_instrs;
+
+   /* Amount that should be subtracted from UBO loads converted to
+    * push_data_intel (in lowered_ubo_instrs)
+    */
+   unsigned reduced_push_ranges;
+};
+
+/* Lower internal UBOs, only used for descriptor buffer loads when the offset
+ * is dynamic. We need to add the base offset of the descriptor buffer to the
+ * offset relative to the descriptor set.
+ */
+static bool
+lower_internal_ubo(nir_builder *b,
+                   nir_intrinsic_instr *intrin)
+{
+   if (!anv_nir_is_internal_ubo(intrin->src[0]))
+      return false;
+
+   b->cursor = nir_before_instr(&intrin->instr);
+
+   nir_intrinsic_instr *resource = nir_src_as_intrinsic(intrin->src[0]);
+
+   /* Add the descriptor offset from the resource array_index source to the
+    * relative offset.
+    */
+   nir_src_rewrite(&intrin->src[1],
+                   nir_iadd(b, resource->src[2].ssa, intrin->src[1].ssa));
+
+   return true;
+}
+
+static bool
+lower_ubo_to_push_data_intel(nir_builder *b,
+                             nir_intrinsic_instr *intrin,
+                             void *_data)
+{
+   if (intrin->intrinsic != nir_intrinsic_load_ubo)
+      return false;
+
+   if (!anv_nir_is_promotable_ubo_binding(intrin->src[0]) ||
+       !nir_src_is_const(intrin->src[1]) ||
+       brw_shader_stage_requires_bindless_resources(b->shader->info.stage))
+      return lower_internal_ubo(b, intrin);
+
+   const struct lower_to_push_data_intel_state *state = _data;
+   const int block = anv_nir_get_ubo_binding_push_block(intrin->src[0]);
+   assert(block < state->push_map->block_count);
+   const struct anv_pipeline_binding *binding =
+      &state->push_map->block_to_descriptor[block];
+   const unsigned byte_offset = nir_src_as_uint(intrin->src[1]);
+   const unsigned num_components =
+      nir_def_last_component_read(&intrin->def) + 1;
+   const int bytes = num_components * (intrin->def.bit_size / 8);
+
+   uint32_t range_offset = 0;
+   const struct anv_push_range *push_range = NULL;
+   for (uint32_t i = 0; i < 4; i++) {
+      if (state->bind_map->push_ranges[i].set == binding->set &&
+          state->bind_map->push_ranges[i].index == binding->index &&
+          byte_offset >= state->bind_map->push_ranges[i].start * 32 &&
+          (byte_offset + bytes) <= (state->bind_map->push_ranges[i].start +
+                                    state->bind_map->push_ranges[i].length) * 32) {
+         push_range = &state->bind_map->push_ranges[i];
+         break;
+      } else {
+         range_offset += state->bind_map->push_ranges[i].length * 32;
+      }
+   }
+
+   if (push_range == NULL)
+      return lower_internal_ubo(b, intrin);
+
+   b->cursor = nir_before_instr(&intrin->instr);
+   nir_def *data = nir_load_push_data_intel(
+      b,
+      nir_def_last_component_read(&intrin->def) + 1,
+      intrin->def.bit_size,
+      nir_imm_int(b, 0),
+      .base = range_offset + byte_offset - push_range->start * 32,
+      .range = nir_intrinsic_range(intrin));
+   nir_def_replace(&intrin->def, data);
+
+   _mesa_set_add(state->lowered_ubo_instrs, nir_def_as_intrinsic(data));
+
+   return true;
+}
+
+static bool
+lower_to_push_data_intel(nir_builder *b,
+                         nir_intrinsic_instr *intrin,
+                         void *_data)
+{
+   const struct lower_to_push_data_intel_state *state = _data;
+   /* With bindless shaders we load uniforms with SEND messages. All the push
+    * constants are located after the RT_DISPATCH_GLOBALS. We just need to add
+    * the offset to the address right after RT_DISPATCH_GLOBALS (see
+    * brw_nir_lower_rt_intrinsics.c).
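+    *
+    * Simplified sketch of the rebasing done below (push_ranges[0] being
+    * the ANV_DESCRIPTOR_SET_PUSH_CONSTANTS range):
+    *
+    *    base_offset = bindless ? 0 : push_ranges[0].start * 32;
+    *    new_base    = old_base - base_offset;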
+ */ + const unsigned base_offset = + brw_shader_stage_is_bindless(b->shader->info.stage) ? + 0 : state->bind_map->push_ranges[0].start * 32; + + switch (intrin->intrinsic) { + case nir_intrinsic_load_push_data_intel: + /* For lowered UBOs to push constants, shrink the base by the amount we + * shrunk the driver push constants. */ - const uint32_t push_reg_mask_start = - anv_drv_const_offset(gfx.push_reg_mask[nir->info.stage]); - const uint32_t push_reg_mask_end = - push_reg_mask_start + - anv_drv_const_size(gfx.push_reg_mask[nir->info.stage]); - push_start = MIN2(push_start, push_reg_mask_start); - push_end = MAX2(push_end, push_reg_mask_end); + if (_mesa_set_search(state->lowered_ubo_instrs, intrin)) + nir_intrinsic_set_base(intrin, nir_intrinsic_base(intrin) - state->reduced_push_ranges); + else + nir_intrinsic_set_base(intrin, nir_intrinsic_base(intrin) - base_offset); + return true; + + case nir_intrinsic_load_push_constant: { + b->cursor = nir_before_instr(&intrin->instr); + nir_def *data = nir_load_push_data_intel( + b, + intrin->def.num_components, + intrin->def.bit_size, + intrin->src[0].ssa, + .base = nir_intrinsic_base(intrin) - base_offset, + .range = nir_intrinsic_range(intrin)); + nir_def_replace(&intrin->def, data); + return true; } - if (nir->info.stage == MESA_SHADER_FRAGMENT) { - if (push_info->fragment_dynamic) { - const uint32_t fs_config_start = - anv_drv_const_offset(gfx.fs_config); - const uint32_t fs_config_end = - fs_config_start + - anv_drv_const_size(gfx.fs_config); - push_start = MIN2(push_start, fs_config_start); - push_end = MAX2(push_end, fs_config_end); - } - - if (needs_wa_18019110168) { - const uint32_t fs_per_prim_remap_start = - anv_drv_const_offset(gfx.fs_per_prim_remap_offset); - const uint32_t fs_per_prim_remap_end = - fs_per_prim_remap_start + - anv_drv_const_size(gfx.fs_per_prim_remap_offset); - push_start = MIN2(push_start, fs_per_prim_remap_start); - push_end = MAX2(push_end, fs_per_prim_remap_end); - } - } - - const bool needs_dyn_tess_config = - (nir->info.stage == MESA_SHADER_TESS_CTRL && - (container_of(prog_key, struct brw_tcs_prog_key, base)->input_vertices == 0 || - push_info->separate_tessellation)) || - (nir->info.stage == MESA_SHADER_TESS_EVAL && - push_info->separate_tessellation); - if (needs_dyn_tess_config) { - const uint32_t tess_config_start = anv_drv_const_offset(gfx.tess_config); - const uint32_t tess_config_end = tess_config_start + - anv_drv_const_size(gfx.tess_config); - push_start = MIN2(push_start, tess_config_start); - push_end = MAX2(push_end, tess_config_end); + default: + return false; } +} +static struct anv_push_range +compute_final_push_range(const struct intel_device_info *devinfo, + const struct push_data *data) +{ /* Align push_start down to a 32B (for 3DSTATE_CONSTANT) and make it no * larger than push_end (no push constants is indicated by push_start = * UINT_MAX). @@ -254,14 +341,50 @@ anv_nir_compute_push_layout(nir_shader *nir, * (unlike all Gfx stages) and so we can bound+align the allocation there * (see anv_cmd_buffer_cs_push_constants). 
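    *
    * A worked example with illustrative numbers: an application range
    * covering bytes [36, 72) and a driver range covering bytes [128, 160)
    * give push_start = ROUND_DOWN_TO(36, 32) = 32 and a byte length of
    * align(160 - 32, grf_size), returned below in 32B units.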
*/ - push_start = MIN2(push_start, push_end); + unsigned push_start = UINT32_MAX; + + if (data->app_end != 0) + push_start = MIN2(push_start, data->app_start); + if (data->driver_end != 0) + push_start = MIN2(push_start, data->driver_start); + + if (push_start == UINT32_MAX) { + return (struct anv_push_range) { + .set = ANV_DESCRIPTOR_SET_PUSH_CONSTANTS, + }; + } + push_start = ROUND_DOWN_TO(push_start, 32); - const struct anv_push_range push_constant_range = { + const unsigned push_size = align( + MAX2(data->app_end, data->driver_end) - push_start, devinfo->grf_size); + + return (struct anv_push_range) { .set = ANV_DESCRIPTOR_SET_PUSH_CONSTANTS, .start = push_start / 32, - .length = align(push_end - push_start, devinfo->grf_size) / 32, + .length = push_size / 32, }; +} + +bool +anv_nir_compute_push_layout(nir_shader *nir, + const struct anv_physical_device *pdevice, + enum brw_robustness_flags robust_flags, + const struct anv_nir_push_layout_info *push_info, + struct brw_base_prog_key *prog_key, + struct brw_stage_prog_data *prog_data, + struct anv_pipeline_bind_map *map, + const struct anv_pipeline_push_map *push_map) +{ + const struct brw_compiler *compiler = pdevice->compiler; + const struct intel_device_info *devinfo = compiler->devinfo; + memset(map->push_ranges, 0, sizeof(map->push_ranges)); + + struct push_data data = + gather_push_data(nir, robust_flags, devinfo, push_info, prog_key, map, NULL); + + struct anv_push_range push_constant_range = + compute_final_push_range(devinfo, &data); /* When platforms support Mesh and the fragment shader is not fully linked * to the previous shader, payload format can change if the preceding @@ -288,54 +411,40 @@ anv_nir_compute_push_layout(nir_shader *nir, * dynamic bit in fs_config_intel. */ const bool needs_padding_per_primitive = - needs_wa_18019110168 || + data.needs_wa_18019110168 || (push_info->mesh_dynamic && (nir->info.inputs_read & VARYING_BIT_PRIMITIVE_ID)); unsigned n_push_ranges = 0; + unsigned total_push_regs = 0; - if (push_constant_range.length > 0) + if (push_constant_range.length > 0) { map->push_ranges[n_push_ranges++] = push_constant_range; + total_push_regs += push_constant_range.length; + } - if (push_ubo_ranges) { - struct brw_ubo_range ubo_ranges[4] = {}; + struct anv_push_range analysis_ranges[4] = {}; + if (data.push_ubo_ranges) { + anv_nir_analyze_push_constants_ranges(nir, devinfo, push_map, + analysis_ranges); + } - brw_nir_analyze_ubo_ranges(compiler, nir, ubo_ranges); + const unsigned max_push_buffers = needs_padding_per_primitive ? 3 : 4; + const unsigned max_push_regs = needs_padding_per_primitive ? 63 : 64; - const unsigned max_push_regs = 64; + for (unsigned i = 0; i < 4; i++) { + struct anv_push_range *candidate_range = &analysis_ranges[i]; + if (n_push_ranges >= max_push_buffers) + break; - unsigned total_push_regs = push_constant_range.length; - for (unsigned i = 0; i < 4; i++) { - if (total_push_regs + ubo_ranges[i].length > max_push_regs) - ubo_ranges[i].length = max_push_regs - total_push_regs; - total_push_regs += ubo_ranges[i].length; - } - assert(total_push_regs <= max_push_regs); + if (candidate_range->length + total_push_regs > max_push_regs) + candidate_range->length = max_push_regs - total_push_regs; - const unsigned max_push_buffers = needs_padding_per_primitive ? 
3 : 4;
+   for (unsigned i = 0; i < 4; i++) {
+      struct anv_push_range *candidate_range = &analysis_ranges[i];
+      if (n_push_ranges >= max_push_buffers)
+         break;
-   for (unsigned i = 0; i < 4; i++) {
-      struct brw_ubo_range *ubo_range = &ubo_ranges[i];
-      if (ubo_range->length == 0)
-         continue;
-
-      if (n_push_ranges >= max_push_buffers) {
-         memset(ubo_range, 0, sizeof(*ubo_range));
-         continue;
-      }
-
-      assert(ubo_range->block < push_map->block_count);
-      const struct anv_pipeline_binding *binding =
-         &push_map->block_to_descriptor[ubo_range->block];
-
-      map->push_ranges[n_push_ranges++] = (struct anv_push_range) {
-         .set = binding->set,
-         .index = binding->index,
-         .dynamic_offset_index = binding->dynamic_offset_index,
-         .start = ubo_range->start,
-         .length = ubo_range->length,
-      };
-   }
+      if (candidate_range->length + total_push_regs > max_push_regs)
+         candidate_range->length = max_push_regs - total_push_regs;
+
+      if (candidate_range->length == 0)
+         break;
+
+      map->push_ranges[n_push_ranges++] = *candidate_range;
+      total_push_regs += candidate_range->length;
    }
 
    /* Pass a single-register push constant payload for the PS stage even if
@@ -366,13 +475,44 @@ anv_nir_compute_push_layout(nir_shader *nir,
 
    assert(n_push_ranges <= 4);
 
+   struct lower_to_push_data_intel_state lower_state = {
+      .bind_map = map,
+      .push_map = push_map,
+      .lowered_ubo_instrs = _mesa_pointer_set_create(NULL),
+   };
+   bool progress = nir_shader_intrinsics_pass(
+      nir, lower_ubo_to_push_data_intel,
+      nir_metadata_control_flow, &lower_state);
+
+   if (progress && nir_opt_dce(nir)) {
+      /* Regather the push data */
+      data = gather_push_data(nir, robust_flags, devinfo, push_info, prog_key,
+                              map, lower_state.lowered_ubo_instrs);
+
+      /* Update the ranges */
+      struct anv_push_range shrunk_push_constant_range =
+         compute_final_push_range(devinfo, &data);
+      assert(shrunk_push_constant_range.length <= push_constant_range.length);
+
+      if (shrunk_push_constant_range.length > 0) {
+         map->push_ranges[0] = shrunk_push_constant_range;
+      } else if (map->push_ranges[0].set == shrunk_push_constant_range.set) {
+         memmove(&map->push_ranges[0], &map->push_ranges[1], 3 * sizeof(map->push_ranges[0]));
+         memset(&map->push_ranges[3], 0, sizeof(map->push_ranges[3]));
+      }
+
+      lower_state.reduced_push_ranges = 32 *
+         (push_constant_range.length - shrunk_push_constant_range.length);
+      push_constant_range = shrunk_push_constant_range;
+   }
+
+   /* Finally lower the application's push constants & the driver's push data */
+   progress |= nir_shader_intrinsics_pass(
       nir, lower_to_push_data_intel,
-      nir_metadata_control_flow,
-      &(struct lower_to_push_data_intel_state) {
-         .bind_map = map,
-         .push_map = push_map,
-      });
+      nir_metadata_control_flow, &lower_state);
+
+   ralloc_free(lower_state.lowered_ubo_instrs);
 
    /* Do this before calling brw_cs_fill_push_const_info(), it uses the data
    * in prog_data->push_sizes[].
@@ -390,17 +530,17 @@ anv_nir_compute_push_layout(nir_shader *nir, prog_data->push_sizes[i] = map->push_ranges[i].length * 32; } + unsigned push_start = push_constant_range.start * 32; if (prog_data->robust_ubo_ranges) { const uint32_t push_reg_mask_offset = anv_drv_const_offset(gfx.push_reg_mask[nir->info.stage]); assert(push_reg_mask_offset >= push_start); - prog_data->push_reg_mask_param = - (push_reg_mask_offset - push_start) / 4; + prog_data->push_reg_mask_param = (push_reg_mask_offset - push_start) / 4; } switch (nir->info.stage) { case MESA_SHADER_TESS_CTRL: - if (needs_dyn_tess_config) { + if (data.needs_dyn_tess_config) { struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data); const uint32_t tess_config_offset = anv_drv_const_offset(gfx.tess_config); @@ -429,7 +569,7 @@ anv_nir_compute_push_layout(nir_shader *nir, assert(fs_config_offset >= push_start); fs_prog_data->fs_config_param = fs_config_offset - push_start; } - if (needs_wa_18019110168) { + if (data.needs_wa_18019110168) { const uint32_t fs_per_prim_remap_offset = anv_drv_const_offset(gfx.fs_per_prim_remap_offset); assert(fs_per_prim_remap_offset >= push_start); @@ -441,8 +581,8 @@ anv_nir_compute_push_layout(nir_shader *nir, case MESA_SHADER_COMPUTE: { const int subgroup_id_index = - push_end == (anv_drv_const_offset(cs.subgroup_id) + - anv_drv_const_size(cs.subgroup_id)) ? + data.driver_end == (anv_drv_const_offset(cs.subgroup_id) + + anv_drv_const_size(cs.subgroup_id)) ? (anv_drv_const_offset(cs.subgroup_id) - push_start) / 4 : -1; struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data); brw_cs_fill_push_const_info(devinfo, cs_prog_data, subgroup_id_index); diff --git a/src/intel/vulkan/anv_nir_lower_resource_intel.c b/src/intel/vulkan/anv_nir_lower_resource_intel.c index 2f9690c230e..d5b0dcf381e 100644 --- a/src/intel/vulkan/anv_nir_lower_resource_intel.c +++ b/src/intel/vulkan/anv_nir_lower_resource_intel.c @@ -36,6 +36,9 @@ update_resource_intel_block(nir_builder *b, nir_intrinsic_instr *intrin, if (intrin->intrinsic != nir_intrinsic_resource_intel) return false; + if (nir_intrinsic_resource_access_intel(intrin) & nir_resource_intel_internal) + return false; + /* If the array index in the descriptor binding is not const, we won't be * able to turn this load_ubo into a push constant. * diff --git a/src/intel/vulkan/anv_nir_push_constants_analysis.c b/src/intel/vulkan/anv_nir_push_constants_analysis.c new file mode 100644 index 00000000000..ac49f65dd2c --- /dev/null +++ b/src/intel/vulkan/anv_nir_push_constants_analysis.c @@ -0,0 +1,336 @@ +/* Copyright © 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "anv_nir.h" +#include "util/u_dynarray.h" + +struct push_range_entry +{ + struct anv_push_range range; + int benefit; +}; + +static int +set_score(uint8_t set) +{ + /* UBO bindings */ + if (set < MAX_SETS) + return 1; + + /* Promotion of descriptor data, higher score than UBOs because of inline + * uniforms or data from the descriptor that can be used for later resource + * access. + */ + switch (set) { + case ANV_DESCRIPTOR_SET_DESCRIPTORS: return 3; + default: UNREACHABLE("unexpected push set"); + } +} + +static int +score(const struct push_range_entry *entry) +{ + return 2 * entry->benefit - entry->range.length; +} + +/** + * Compares score for two UBO range entries. + * + * For a descending qsort(). 
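+ *
+ * For example (illustrative numbers): an entry with benefit 6 and
+ * length 2 scores 2 * 6 - 2 = 10 and sorts ahead of one with benefit 3
+ * and length 1 (score 5); remaining ties fall through to the set kind,
+ * set index, block index and start offset comparisons below.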
+ */ +static int +cmp_push_range_entry(const void *va, const void *vb) +{ + const struct push_range_entry *a = va; + const struct push_range_entry *b = vb; + + /* Rank based on scores, descending order */ + int delta = score(b) - score(a); + + /* Then use promotion type, descending order */ + if (delta == 0) + delta = set_score(b->range.set) - set_score(a->range.set); + + /* Then use the set index as a tie-breaker, descending order */ + if (delta == 0) + delta = b->range.set - a->range.set; + + /* Then use the UBO block index as a tie-breaker, descending order */ + if (delta == 0) + delta = b->range.index - a->range.index; + + /* Finally use the start offset as a second tie-breaker, ascending order */ + if (delta == 0) + delta = a->range.start - b->range.start; + + return delta; +} + +enum push_block_type { + PUSH_BLOCK_TYPE_UBO = 1, +}; + +struct push_block_key +{ + enum push_block_type type; + uint32_t index; +}; + +struct push_block_info +{ + struct push_block_key key; + + /* Each bit in the offsets bitfield represents a 32-byte section of data. + * If it's set to one, there is interesting UBO data at that offset. If + * not, there's a "hole" - padding between data - or just nothing at all. + */ + uint64_t offsets; + uint8_t uses[64]; +}; + +struct push_analysis_state +{ + const struct intel_device_info *devinfo; + struct hash_table *blocks; +}; + +static uint32_t +push_block_key_hash(const void *key) +{ + return _mesa_hash_data(key, sizeof(struct push_block_key)); +} + +static bool +push_block_key_compare(const void *key1, const void *key2) +{ + return memcmp(key1, key2, sizeof(struct push_block_key)) == 0; +} + +static struct push_block_info * +get_block_info(struct push_analysis_state *state, + enum push_block_type type, uint32_t index) +{ + struct push_block_key key = { .type = type, .index = index, }; + struct hash_entry *entry = + _mesa_hash_table_search(state->blocks, &key); + if (entry) + return (struct push_block_info *) entry->data; + + struct push_block_info *info = + rzalloc(state->blocks, struct push_block_info); + info->key = key; + _mesa_hash_table_insert(state->blocks, &info->key, info); + + return info; +} + +static void +maybe_add_pushable_ubo(struct push_analysis_state *state, + nir_intrinsic_instr *intrin) +{ + const int block = anv_nir_get_ubo_binding_push_block(intrin->src[0]); + const unsigned byte_offset = nir_src_as_uint(intrin->src[1]); + const int offset = byte_offset / state->devinfo->grf_size; + + /* Avoid shifting by larger than the width of our bitfield, as this + * is undefined in C. Even if we require multiple bits to represent + * the entire value, it's OK to record a partial value - the backend + * is capable of falling back to pull loads for later components of + * vectors, as it has to shrink ranges for other reasons anyway. + */ + if (offset >= 64) + return; + + /* The value might span multiple GRFs. */ + const unsigned num_components = + nir_def_last_component_read(&intrin->def) + 1; + const int bytes = num_components * (intrin->def.bit_size / 8); + const int start = ROUND_DOWN_TO(byte_offset, state->devinfo->grf_size); + const int end = align(byte_offset + bytes, state->devinfo->grf_size); + const int chunks = (end - start) / state->devinfo->grf_size; + + /* TODO: should we count uses in loops as higher benefit? 
*/ + + struct push_block_info *info = + get_block_info(state, PUSH_BLOCK_TYPE_UBO, block); + info->offsets |= ((1ull << chunks) - 1) << offset; + info->uses[offset]++; +} + +static void +analyze_pushable_block(struct push_analysis_state *state, nir_block *block) +{ + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_load_ubo: + if (anv_nir_is_promotable_ubo_binding(intrin->src[0]) && + nir_src_is_const(intrin->src[1])) + maybe_add_pushable_ubo(state, intrin); + break; + + default: + break; + } + } +} + +static void +print_push_entry(FILE *file, + const struct push_block_info *info, + const struct push_range_entry *entry, + struct push_analysis_state *state) +{ + fprintf(file, + "set %2d, index %2d, start %2d, length %2d, bits = %"PRIx64", " + "benefit %2d, cost %2d, score = %2d\n", + entry->range.set, entry->range.index, + entry->range.start, entry->range.length, + info ? info->offsets : 0ul, entry->benefit, entry->range.length, score(entry)); +} + +void +anv_nir_analyze_push_constants_ranges(nir_shader *nir, + const struct intel_device_info *devinfo, + const struct anv_pipeline_push_map *push_map, + struct anv_push_range out_ranges[4]) +{ + void *mem_ctx = ralloc_context(NULL); + + struct push_analysis_state state = { + .devinfo = devinfo, + .blocks = _mesa_hash_table_create(mem_ctx, + push_block_key_hash, + push_block_key_compare), + }; + + /* Walk the IR, recording how many times each UBO block/offset is used. */ + nir_foreach_function_impl(impl, nir) { + nir_foreach_block(block, impl) { + analyze_pushable_block(&state, block); + } + } + + /* Find ranges: a block, starting register-size aligned byte offset, and + * length. + */ + struct util_dynarray ranges; + util_dynarray_init(&ranges, mem_ctx); + + hash_table_foreach(state.blocks, entry) { + const struct push_block_info *info = entry->data; + uint64_t offsets = info->offsets; + + /* Walk through the offsets bitfield, finding contiguous regions of + * set bits: + * + * 0000000001111111111111000000000000111111111111110000000011111100 + * ^^^^^^^^^^^^^ ^^^^^^^^^^^^^^ ^^^^^^ + * + * Each of these will become a UBO range. + */ + while (offsets != 0) { + /* Find the first 1 in the offsets bitfield. This represents the + * start of a range of interesting UBO data. Make it zero-indexed. + */ + int first_bit = ffsll(offsets) - 1; + + /* Find the first 0 bit in offsets beyond first_bit. To find the + * first zero bit, we find the first 1 bit in the complement. In + * order to ignore bits before first_bit, we mask off those bits. + */ + int first_hole = ffsll(~offsets & ~((1ull << first_bit) - 1)) - 1; + + if (first_hole == -1) { + /* If we didn't find a hole, then set it to the end of the + * bitfield. There are no more ranges to process. + */ + first_hole = 64; + offsets = 0; + } else { + /* We've processed all bits before first_hole. Mask them off. 
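+            *
+            * E.g. for offsets = 0b01101100 (illustrative), the first
+            * iteration finds first_bit = 2 and first_hole = 4, records
+            * the two-chunk range [2, 4) and leaves 0b01100000 for the
+            * next iteration.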
*/ + offsets &= ~((1ull << first_hole) - 1); + } + + struct push_range_entry *entry = + util_dynarray_grow(&ranges, struct push_range_entry, 1); + + assert(info->key.index < push_map->block_count); + const struct anv_pipeline_binding *binding = + &push_map->block_to_descriptor[info->key.index]; + entry->range.set = binding->set; + entry->range.index = binding->index; + entry->range.dynamic_offset_index = binding->dynamic_offset_index; + entry->range.start = first_bit; + /* first_hole is one beyond the end, so we don't need to add 1 */ + entry->range.length = first_hole - first_bit; + entry->benefit = 0; + + for (int i = 0; i < entry->range.length; i++) + entry->benefit += info->uses[first_bit + i]; + + if (false) + print_push_entry(stderr, info, entry, &state); + } + } + + /* TODO: Consider combining ranges. + * + * We can only push 4 ranges via 3DSTATE_CONSTANT_XS. If there are + * more ranges, and two are close by with only a small hole, it may be + * worth combining them. The holes will waste register space, but the + * benefit of removing pulls may outweigh that cost. + */ + + /* Sort the list so the most beneficial ranges are at the front. */ + int nr_entries = ranges.size / sizeof(struct push_range_entry); + if (nr_entries > 0) { + qsort(ranges.data, nr_entries, sizeof(struct push_range_entry), + cmp_push_range_entry); + } + + if (false) { + util_dynarray_foreach(&ranges, struct push_range_entry, entry) { + print_push_entry(stderr, NULL, entry, &state); + } + } + + struct push_range_entry *entries = ranges.data; + + for (unsigned i = 0; i < nr_entries; i++) { + entries[i].range.start *= devinfo->grf_size / 32; + entries[i].range.length *= devinfo->grf_size / 32; + } + + /* Return the top 4, limited to the maximum number of push registers. + * + * The Vulkan driver sets up additional non-UBO push constants, so it may + * need to shrink these ranges further (see anv_nir_compute_push_layout.c). + * The OpenGL driver treats legacy uniforms as a UBO, so this is enough. + * + * To limit further, simply drop the tail of the list, as that's the least + * valuable portion. 
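+    *
+    * E.g. with illustrative lengths of 40, 20 and 30 registers, the
+    * clamping loop below keeps 40, 20 and 4, exactly filling the
+    * 64-register budget.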
+ */ + const int max_ubos = 4; + nr_entries = MIN2(nr_entries, max_ubos); + + const unsigned max_push = 64; + unsigned total_push = 0; + + for (unsigned i = 0; i < nr_entries; i++) { + if (total_push + entries[i].range.length > max_push) + entries[i].range.length = max_push - total_push; + total_push += entries[i].range.length; + } + + for (int i = 0; i < nr_entries; i++) + out_ranges[i] = entries[i].range; + for (int i = nr_entries; i < 4; i++) + out_ranges[i] = (struct anv_push_range) {}; + + ralloc_free(ranges.mem_ctx); +} diff --git a/src/intel/vulkan/anv_nir_push_descriptor_analysis.c b/src/intel/vulkan/anv_nir_push_descriptor_analysis.c index a14441f07b9..243d8b00a84 100644 --- a/src/intel/vulkan/anv_nir_push_descriptor_analysis.c +++ b/src/intel/vulkan/anv_nir_push_descriptor_analysis.c @@ -203,6 +203,10 @@ anv_nir_push_desc_ubo_fully_promoted(nir_shader *nir, if (nir_intrinsic_desc_set(resource) != push_set) continue; + /* Skip load_ubo loading the descriptor buffer (not a binding) */ + if (nir_intrinsic_binding(resource) == UINT32_MAX) + continue; + uint32_t binding = nir_intrinsic_binding(resource); /* If we have indirect indexing in the binding, no push promotion diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index acb4c6ef710..be480fa5331 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -2687,6 +2687,10 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, break; case ANV_DESCRIPTOR_SET_DESCRIPTORS: + /* We have LSC_SS surface states for this, binding table isn't + * needed. + */ + assert(!cmd_buffer->device->info->has_lsc); if (shader->bind_map.layout_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER) { assert(pipe_state->descriptor_buffers[binding->index].state.alloc_size); bt_map[s] = pipe_state->descriptor_buffers[binding->index].state.offset + diff --git a/src/intel/vulkan/meson.build b/src/intel/vulkan/meson.build index 7e5e7714494..92ac57b2305 100644 --- a/src/intel/vulkan/meson.build +++ b/src/intel/vulkan/meson.build @@ -181,6 +181,7 @@ libanv_files = files( 'anv_nir_lower_ubo_loads.c', 'anv_nir_lower_resource_intel.c', 'anv_nir_lower_unaligned_dispatch.c', + 'anv_nir_push_constants_analysis.c', 'anv_nir_push_descriptor_analysis.c', 'anv_perf.c', 'anv_physical_device.c',