anv: use internal surface state on Gfx12.5+ to access descriptor buffers

As a result on Gfx12.5+ we're not holding any binding table entry to
access descriptor buffers.

This should reduce the amount of binding table allocations.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/10711
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35160>
This commit is contained in:
Lionel Landwerlin 2025-05-22 15:43:47 +03:00 committed by Marge Bot
parent 87abf57764
commit e94cb92cb0
9 changed files with 865 additions and 299 deletions

View file

@ -649,24 +649,10 @@ anv_cmd_buffer_bind_descriptor_set(struct anv_cmd_buffer *cmd_buffer,
cmd_buffer->state.descriptors_dirty |= stages;
cmd_buffer->state.descriptor_buffers.offsets_dirty |= stages;
} else {
/* When using indirect descriptors, stages that have access to the HW
* binding tables, never need to access the
* anv_push_constants::desc_offsets fields, because any data they
* need from the descriptor buffer is accessible through a binding
* table entry. For stages that are "bindless" (Mesh/Task/RT), we
* need to provide anv_push_constants::desc_offsets matching the
* bound descriptor so that shaders can access the descriptor buffer
* through A64 messages.
*
* With direct descriptors, the shaders can use the
* anv_push_constants::desc_offsets to build bindless offsets. So
* we always need to update the push constant data.
/* Platforms with LSC will use descriptor buffer push constant
* offsets.
*/
bool update_desc_sets =
!cmd_buffer->device->physical->indirect_descriptors ||
(stages & (VK_SHADER_STAGE_TASK_BIT_EXT |
VK_SHADER_STAGE_MESH_BIT_EXT |
ANV_RT_STAGE_BITS));
bool update_desc_sets = cmd_buffer->device->info->has_lsc;
if (update_desc_sets) {
struct anv_push_constants *push = &pipe_state->push_constants;
@ -679,14 +665,15 @@ anv_cmd_buffer_bind_descriptor_set(struct anv_cmd_buffer *cmd_buffer,
push->desc_sampler_offsets[set_index] =
anv_address_physical(set->desc_sampler_addr) -
cmd_buffer->device->physical->va.dynamic_state_pool.addr;
anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
set->desc_surface_addr.bo);
anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
set->desc_sampler_addr.bo);
}
}
/* Always add a reference to the buffers */
anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
set->desc_surface_addr.bo);
anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
set->desc_sampler_addr.bo);
dirty_stages |= stages;
}

View file

@ -133,6 +133,9 @@ void anv_nir_validate_push_layout(const struct anv_physical_device *pdevice,
bool anv_nir_update_resource_intel_block(nir_shader *shader);
bool anv_nir_lower_desc_address(nir_shader *shader,
const struct anv_pipeline_bind_map *map);
bool anv_nir_lower_unaligned_dispatch(nir_shader *shader);
bool anv_nir_lower_resource_intel(nir_shader *shader,
@ -159,6 +162,40 @@ void anv_apply_per_prim_attr_wa(struct nir_shader *ms_nir,
struct nir_shader *fs_nir,
struct anv_device *device);
static inline bool
anv_nir_is_promotable_ubo_binding(nir_src src)
{
nir_intrinsic_instr *intrin = nir_src_as_intrinsic(src);
return intrin && intrin->intrinsic == nir_intrinsic_resource_intel &&
(nir_intrinsic_resource_access_intel(intrin) &
nir_resource_intel_pushable);
}
static inline bool
anv_nir_is_internal_ubo(nir_src src)
{
nir_intrinsic_instr *intrin = nir_src_as_intrinsic(src);
return intrin && intrin->intrinsic == nir_intrinsic_resource_intel &&
(nir_intrinsic_resource_access_intel(intrin) &
nir_resource_intel_internal);
}
static inline unsigned
anv_nir_get_ubo_binding_push_block(nir_src src)
{
nir_intrinsic_instr *intrin = nir_src_as_intrinsic(src);
assert(intrin && intrin->intrinsic == nir_intrinsic_resource_intel);
return nir_intrinsic_resource_block_intel(intrin);
}
void anv_nir_analyze_push_constants_ranges(nir_shader *nir,
const struct intel_device_info *devinfo,
const struct anv_pipeline_push_map *push_map,
struct anv_push_range out_ranges[4]);
#ifdef __cplusplus
}
#endif

View file

@ -72,6 +72,7 @@ struct apply_pipeline_layout_state {
struct {
bool desc_buffer_used;
uint8_t desc_offset;
uint32_t push_block;
struct anv_binding_apply_layout {
uint8_t use_count;
@ -389,35 +390,17 @@ build_load_descriptor_mem(nir_builder *b,
const struct apply_pipeline_layout_state *state)
{
switch (state->desc_addr_format) {
case nir_address_format_64bit_global_32bit_offset: {
nir_def *base_addr =
nir_pack_64_2x32(b, nir_trim_vector(b, desc_addr, 2));
nir_def *offset32 =
nir_iadd_imm(b, nir_channel(b, desc_addr, 3), desc_offset);
assert(state->desc_addr_format == nir_address_format_32bit_index_offset);
return nir_load_global_constant_offset(b, num_components, bit_size,
base_addr, offset32,
.align_mul = 8,
.align_offset = desc_offset % 8);
}
nir_def *surface_index = nir_channel(b, desc_addr, 0);
nir_def *offset32 = nir_iadd_imm(b, nir_channel(b, desc_addr, 1), desc_offset);
case nir_address_format_32bit_index_offset: {
nir_def *surface_index = nir_channel(b, desc_addr, 0);
nir_def *offset32 =
nir_iadd_imm(b, nir_channel(b, desc_addr, 1), desc_offset);
return nir_load_ubo(b, num_components, bit_size,
surface_index, offset32,
.align_mul = 8,
.align_offset = desc_offset % 8,
.range_base = 0,
.range = num_components * bit_size / 8);
}
default:
UNREACHABLE("Unsupported address format");
}
return nir_load_ubo(b, num_components, bit_size,
surface_index, offset32,
.align_mul = 8,
.align_offset = desc_offset % 8,
.range_base = 0,
.range = num_components * bit_size / 8);
}
/* When using direct descriptor, we do not have a structure to read in memory
@ -635,11 +618,14 @@ build_desc_address64(nir_builder *b, nir_def *set_idx, unsigned set_idx_imm,
/** Build a 32bit_index_offset address for a descriptor set */
static nir_def *
build_desc_address32(nir_builder *b,
nir_def *set_idx, nir_def *offset,
nir_def *set_idx, unsigned set,
nir_def *offset,
const struct apply_pipeline_layout_state *state)
{
return nir_vec2(b,
nir_vector_extract(b, state->set_idx_to_bti, set_idx),
nir_vector_extract(
b, state->set_idx_to_bti,
set < MAX_SETS ? nir_imm_int(b, set) : set_idx),
offset);
}
@ -820,7 +806,7 @@ build_desc_addr_for_res_index(nir_builder *b,
}
case nir_address_format_32bit_index_offset:
return build_desc_address32(b, res.set_idx, desc_offset, state);
return build_desc_address32(b, res.set_idx, UINT32_MAX, desc_offset, state);
default:
UNREACHABLE("Unhandled address format");
@ -830,7 +816,7 @@ build_desc_addr_for_res_index(nir_builder *b,
case nir_address_format_32bit_index_offset:
assert(desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK);
assert(state->desc_addr_format == nir_address_format_32bit_index_offset);
return build_desc_address32(b, res.set_idx, desc_offset, state);
return build_desc_address32(b, res.set_idx, UINT32_MAX, desc_offset, state);
default:
UNREACHABLE("Unhandled address format");
@ -878,7 +864,7 @@ build_desc_addr_for_binding(nir_builder *b,
desc_offset = nir_iadd_imm(
b, desc_offset, plane * bind_layout->descriptor_data_surface_size);
}
return build_desc_address32(b, nir_imm_int(b, set), desc_offset, state);
return build_desc_address32(b, NULL, set, desc_offset, state);
}
default:
@ -1245,10 +1231,10 @@ build_buffer_addr_for_binding(nir_builder *b,
if (addr_format != nir_address_format_32bit_index_offset)
return build_buffer_addr_for_res_index(b, desc_type, res_index, addr_format, state);
if (desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
const struct anv_descriptor_set_binding_layout *bind_layout =
&state->set_layouts[set]->binding[binding];
return build_desc_address32(b, nir_imm_int(b, set),
const struct anv_descriptor_set_binding_layout *bind_layout =
&state->set_layouts[set]->binding[binding];
if (bind_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
return build_desc_address32(b, NULL, set,
nir_imm_int(b, bind_layout->descriptor_surface_offset),
state);
}
@ -1426,9 +1412,6 @@ try_lower_direct_buffer_intrinsic(nir_builder *b,
if (state->bind_map->layout_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT &&
!descriptor_has_bti(desc, state))
return false;
/* Rewrite to 32bit_index_offset whenever we can */
addr_format = nir_address_format_32bit_index_offset;
} else {
assert(nir_deref_mode_is(deref, nir_var_mem_ubo));
@ -1444,15 +1427,11 @@ try_lower_direct_buffer_intrinsic(nir_builder *b,
bind_layout->type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK &&
!descriptor_has_bti(desc, state))
return false;
/* If this is an inline uniform and the shader stage is bindless, we
* can't switch to 32bit_index_offset.
*/
if (bind_layout->type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK ||
!brw_shader_stage_requires_bindless_resources(b->shader->info.stage))
addr_format = nir_address_format_32bit_index_offset;
}
/* Rewrite to 32bit_index_offset whenever we can */
addr_format = nir_address_format_32bit_index_offset;
/* If a dynamic has not been assigned a binding table entry, we need to
* bail here.
*/
@ -2072,7 +2051,7 @@ binding_is_promotable_to_push(const struct anv_descriptor_set_layout *set_layout
return (bind_layout->flags & non_pushable_binding_flags) == 0;
}
static void
static uint32_t
add_null_bti_entry(struct anv_pipeline_bind_map *map)
{
map->surface_to_descriptor[map->surface_count++] =
@ -2080,9 +2059,25 @@ add_null_bti_entry(struct anv_pipeline_bind_map *map)
.set = ANV_DESCRIPTOR_SET_NULL,
};
assert(map->surface_count <= MAX_BINDING_TABLE_SIZE);
return map->surface_count - 1;
}
static void
static uint32_t
add_desc_bti_entry(struct anv_pipeline_bind_map *map,
uint32_t set)
{
map->surface_to_descriptor[map->surface_count++] =
(struct anv_pipeline_binding) {
.set = ANV_DESCRIPTOR_SET_DESCRIPTORS,
.binding = UINT32_MAX,
.index = set,
};
assert(map->surface_count <= MAX_BINDING_TABLE_SIZE);
return map->surface_count - 1;
}
static uint32_t
add_bti_entry(struct anv_pipeline_bind_map *map,
uint32_t set,
uint32_t binding,
@ -2101,9 +2096,11 @@ add_bti_entry(struct anv_pipeline_bind_map *map,
.plane = plane,
};
assert(map->surface_count <= MAX_BINDING_TABLE_SIZE);
return map->surface_count - 1;
}
static void
static uint32_t
add_dynamic_bti_entry(struct anv_pipeline_bind_map *map,
uint32_t set,
uint32_t binding,
@ -2120,6 +2117,8 @@ add_dynamic_bti_entry(struct anv_pipeline_bind_map *map,
.dynamic_offset_index = bind_layout->dynamic_offset_index + element,
};
assert(map->surface_count <= MAX_BINDING_TABLE_SIZE);
return map->surface_count - 1;
}
static void
@ -2139,6 +2138,19 @@ add_sampler_entry(struct anv_pipeline_bind_map *map,
};
}
static void
add_descriptor_push_entry(struct anv_pipeline_push_map *push_map,
uint32_t set,
struct anv_pipeline_bind_map *map)
{
push_map->block_to_descriptor[push_map->block_count++] =
(struct anv_pipeline_binding) {
.set = ANV_DESCRIPTOR_SET_DESCRIPTORS,
.binding = UINT32_MAX,
.index = set,
};
}
static void
add_push_entry(struct anv_pipeline_push_map *push_map,
uint32_t set,
@ -2218,7 +2230,7 @@ build_packed_binding_table(struct apply_pipeline_layout_state *state,
void *push_map_mem_ctx)
{
/* Compute the amount of push block items required. */
unsigned push_block_count = 0;
unsigned push_block_count = map->surface_count + MAX_SETS;
for (unsigned s = 0; s < state->set_count; s++) {
const struct anv_descriptor_set_layout *set_layout =
state->set_layouts[s];
@ -2231,31 +2243,37 @@ build_packed_binding_table(struct apply_pipeline_layout_state *state,
}
}
/* Assign a BTI to each used descriptor set */
for (unsigned s = 0; s < state->set_count; s++) {
if (state->desc_addr_format != nir_address_format_32bit_index_offset) {
state->set[s].desc_offset = BINDLESS_OFFSET;
} else if (state->set[s].desc_buffer_used) {
map->surface_to_descriptor[map->surface_count] =
(struct anv_pipeline_binding) {
.set = ANV_DESCRIPTOR_SET_DESCRIPTORS,
.binding = UINT32_MAX,
.index = s,
};
state->set[s].desc_offset = map->surface_count++;
}
}
/* Assign a block index for each surface */
push_map->block_to_descriptor =
rzalloc_array(push_map_mem_ctx, struct anv_pipeline_binding,
map->surface_count + push_block_count);
push_map->block_to_descriptor = rzalloc_array(push_map_mem_ctx,
struct anv_pipeline_binding,
push_block_count);
memcpy(push_map->block_to_descriptor,
map->surface_to_descriptor,
sizeof(push_map->block_to_descriptor[0]) * map->surface_count);
push_map->block_count = map->surface_count;
/* Assign a BTI to each used descriptor set */
for (unsigned s = 0; s < state->set_count; s++) {
if (state->set[s].desc_buffer_used) {
/* Only add a binding table entry on platform that cannot use
* LSC_ADDR_SURFTYPE_SS.
*/
if (!state->pdevice->info.has_lsc)
state->set[s].desc_offset = add_desc_bti_entry(map, s);
if (brw_shader_stage_requires_bindless_resources(shader->info.stage)) {
state->set[s].push_block = UINT32_MAX;
} else {
state->set[s].push_block = push_map->block_count;
add_descriptor_push_entry(push_map, s, state->bind_map);
}
} else {
state->set[s].desc_offset = BINDLESS_OFFSET;
state->set[s].push_block = UINT32_MAX;
}
}
/* Count used bindings, assign embedded sampler indices & add push blocks
* for promotion to push constants
*/
@ -2428,19 +2446,59 @@ build_packed_binding_table(struct apply_pipeline_layout_state *state,
}
static nir_def *
build_descriptor_bti_vec(nir_builder *b,
build_descriptor_set_bti(nir_builder *b,
uint32_t set,
const struct apply_pipeline_layout_state *state)
{
if (state->pdevice->info.has_lsc) {
nir_def *surface_handle =
nir_load_reloc_const_intel(
b,
state->bind_map->layout_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER ?
BRW_SHADER_RELOC_DESCRIPTORS_BUFFERS_VIEW_HANDLE :
BRW_SHADER_RELOC_DESCRIPTORS_VIEW_HANDLE);
return nir_resource_intel(
b,
nir_imm_int(b, set),
surface_handle,
nir_iand_imm(b,
anv_load_driver_uniform(b, 1, desc_surface_offsets[set]),
ANV_DESCRIPTOR_SET_OFFSET_MASK) /* array_index */,
nir_imm_int(b, 0) /* bindless_base_offset */,
.desc_set = set,
.binding = -1,
.resource_block_intel = state->set[set].push_block,
.resource_access_intel = nir_resource_intel_pushable |
nir_resource_intel_internal);
} else {
return nir_resource_intel(
b,
nir_imm_int(b, set),
nir_imm_int(b, state->set[set].desc_offset),
nir_imm_int(b, 0) /* array_index */,
nir_imm_int(b, 0) /* bindless_base_offset */,
.desc_set = set,
.binding = -1,
.resource_block_intel = state->set[set].desc_offset,
.resource_access_intel = nir_resource_intel_pushable);
}
}
static nir_def *
build_descriptor_sets_bti_vec(nir_builder *b,
const struct apply_pipeline_layout_state *state)
{
STATIC_ASSERT(MAX_SETS == 8);
return nir_vec8(b,
nir_imm_int(b, state->set[0].desc_offset),
nir_imm_int(b, state->set[1].desc_offset),
nir_imm_int(b, state->set[2].desc_offset),
nir_imm_int(b, state->set[3].desc_offset),
nir_imm_int(b, state->set[4].desc_offset),
nir_imm_int(b, state->set[5].desc_offset),
nir_imm_int(b, state->set[6].desc_offset),
nir_imm_int(b, state->set[7].desc_offset));
build_descriptor_set_bti(b, 0, state),
build_descriptor_set_bti(b, 1, state),
build_descriptor_set_bti(b, 2, state),
build_descriptor_set_bti(b, 3, state),
build_descriptor_set_bti(b, 4, state),
build_descriptor_set_bti(b, 5, state),
build_descriptor_set_bti(b, 6, state),
build_descriptor_set_bti(b, 7, state));
}
bool
@ -2462,8 +2520,6 @@ anv_nir_apply_pipeline_layout(nir_shader *shader,
anv_validate_pipeline_layout(set_layouts, set_count, shader);
#endif
const bool bindless_stage =
brw_shader_stage_requires_bindless_resources(shader->info.stage);
struct apply_pipeline_layout_state state = {
.mem_ctx = ralloc_context(NULL),
.pdevice = pdevice,
@ -2471,9 +2527,7 @@ anv_nir_apply_pipeline_layout(nir_shader *shader,
.set_layouts = set_layouts,
.set_count = set_count,
.dynamic_offset_start = dynamic_offset_start,
.desc_addr_format = bindless_stage ?
nir_address_format_64bit_global_32bit_offset :
nir_address_format_32bit_index_offset,
.desc_addr_format = nir_address_format_32bit_index_offset,
.ssbo_addr_format = anv_nir_ssbo_addr_format(pdevice, robust_flags),
.ubo_addr_format = anv_nir_ubo_addr_format(pdevice, robust_flags),
};
@ -2529,7 +2583,7 @@ anv_nir_apply_pipeline_layout(nir_shader *shader,
*/
nir_foreach_function_impl(impl, shader) {
nir_builder _b = nir_builder_at(nir_before_impl(impl)), *b = &_b;
state.set_idx_to_bti = build_descriptor_bti_vec(b, &state);
state.set_idx_to_bti = build_descriptor_sets_bti_vec(b, &state);
progress |= nir_function_instructions_pass(impl,
lower_direct_buffer_instr,
nir_metadata_control_flow,
@ -2543,7 +2597,7 @@ anv_nir_apply_pipeline_layout(nir_shader *shader,
nir_foreach_function_impl(impl, shader) {
nir_builder _b = nir_builder_at(nir_before_impl(impl)), *b = &_b;
state.set_idx_to_bti = build_descriptor_bti_vec(b, &state);
state.set_idx_to_bti = build_descriptor_sets_bti_vec(b, &state);
progress |= nir_function_instructions_pass(impl,
apply_pipeline_layout,
nir_metadata_control_flow,

View file

@ -25,111 +25,90 @@
#include "nir_builder.h"
#include "compiler/brw/brw_nir.h"
#include "util/mesa-sha1.h"
#include "util/set.h"
struct lower_to_push_data_intel_state {
const struct anv_pipeline_bind_map *bind_map;
const struct anv_pipeline_push_map *push_map;
struct push_data {
bool push_ubo_ranges;
bool needs_wa_18019110168;
bool needs_dyn_tess_config;
unsigned app_start, app_end;
unsigned driver_start, driver_end;
};
static bool
lower_to_push_data_intel(nir_builder *b,
nir_intrinsic_instr *intrin,
void *data)
static void
adjust_driver_push_values(nir_shader *nir,
enum brw_robustness_flags robust_flags,
const struct anv_nir_push_layout_info *push_info,
struct brw_base_prog_key *prog_key,
const struct intel_device_info *devinfo,
struct push_data *data)
{
const struct lower_to_push_data_intel_state *state = data;
/* With bindless shaders we load uniforms with SEND messages. All the push
* constants are located after the RT_DISPATCH_GLOBALS. We just need to add
* the offset to the address right after RT_DISPATCH_GLOBALS (see
* brw_nir_lower_rt_intrinsics.c).
*/
const unsigned base_offset =
brw_shader_stage_is_bindless(b->shader->info.stage) ?
0 : state->bind_map->push_ranges[0].start * 32;
switch (intrin->intrinsic) {
case nir_intrinsic_load_push_data_intel: {
nir_intrinsic_set_base(intrin, nir_intrinsic_base(intrin) - base_offset);
return true;
if (data->push_ubo_ranges && (robust_flags & BRW_ROBUSTNESS_UBO)) {
/* We can't on-the-fly adjust our push ranges because doing so would
* mess up the layout in the shader. When robustBufferAccess is
* enabled, we push a mask into the shader indicating which pushed
* registers are valid and we zero out the invalid ones at the top of
* the shader.
*/
const uint32_t push_reg_mask_start =
anv_drv_const_offset(gfx.push_reg_mask[nir->info.stage]);
const uint32_t push_reg_mask_end =
push_reg_mask_start +
anv_drv_const_size(gfx.push_reg_mask[nir->info.stage]);
data->driver_start = MIN2(data->driver_start, push_reg_mask_start);
data->driver_end = MAX2(data->driver_end, push_reg_mask_end);
}
case nir_intrinsic_load_push_constant: {
b->cursor = nir_before_instr(&intrin->instr);
nir_def *data = nir_load_push_data_intel(
b,
intrin->def.num_components,
intrin->def.bit_size,
intrin->src[0].ssa,
.base = nir_intrinsic_base(intrin) - base_offset,
.range = nir_intrinsic_range(intrin));
nir_def_replace(&intrin->def, data);
return true;
}
case nir_intrinsic_load_ubo: {
if (!brw_nir_ubo_surface_index_is_pushable(intrin->src[0]) ||
!nir_src_is_const(intrin->src[1]))
return false;
const int block = brw_nir_ubo_surface_index_get_push_block(intrin->src[0]);
const unsigned byte_offset = nir_src_as_uint(intrin->src[1]);
const unsigned num_components =
nir_def_last_component_read(&intrin->def) + 1;
const int bytes = num_components * (intrin->def.bit_size / 8);
const struct anv_pipeline_binding *binding =
&state->push_map->block_to_descriptor[block];
uint32_t range_offset = 0;
const struct anv_push_range *push_range = NULL;
for (uint32_t i = 0; i < 4; i++) {
if (state->bind_map->push_ranges[i].set == binding->set &&
state->bind_map->push_ranges[i].index == binding->index &&
byte_offset >= state->bind_map->push_ranges[i].start * 32 &&
(byte_offset + bytes) <= (state->bind_map->push_ranges[i].start +
state->bind_map->push_ranges[i].length) * 32) {
push_range = &state->bind_map->push_ranges[i];
break;
} else {
range_offset += state->bind_map->push_ranges[i].length * 32;
}
if (nir->info.stage == MESA_SHADER_FRAGMENT) {
if (push_info->fragment_dynamic) {
const uint32_t fs_config_start = anv_drv_const_offset(gfx.fs_config);
const uint32_t fs_config_end = fs_config_start +
anv_drv_const_size(gfx.fs_config);
data->driver_start = MIN2(data->driver_start, fs_config_start);
data->driver_end = MAX2(data->driver_end, fs_config_end);
}
if (push_range == NULL)
return false;
b->cursor = nir_before_instr(&intrin->instr);
nir_def *data = nir_load_push_data_intel(
b,
nir_def_last_component_read(&intrin->def) + 1,
intrin->def.bit_size,
nir_imm_int(b, 0),
.base = range_offset + byte_offset - push_range->start * 32,
.range = nir_intrinsic_range(intrin));
nir_def_replace(&intrin->def, data);
return true;
if (data->needs_wa_18019110168) {
const uint32_t fs_per_prim_remap_start =
anv_drv_const_offset(gfx.fs_per_prim_remap_offset);
const uint32_t fs_per_prim_remap_end =
fs_per_prim_remap_start +
anv_drv_const_size(gfx.fs_per_prim_remap_offset);
data->driver_start = MIN2(data->driver_start, fs_per_prim_remap_start);
data->driver_end = MAX2(data->driver_end, fs_per_prim_remap_end);
}
}
default:
return false;
data->needs_dyn_tess_config =
(nir->info.stage == MESA_SHADER_TESS_CTRL &&
(container_of(prog_key, struct brw_tcs_prog_key, base)->input_vertices == 0 ||
push_info->separate_tessellation)) ||
(nir->info.stage == MESA_SHADER_TESS_EVAL &&
push_info->separate_tessellation);
if (data->needs_dyn_tess_config) {
const uint32_t tess_config_start = anv_drv_const_offset(gfx.tess_config);
const uint32_t tess_config_end = tess_config_start +
anv_drv_const_size(gfx.tess_config);
data->driver_start = MIN2(data->driver_start, tess_config_start);
data->driver_end = MAX2(data->driver_end, tess_config_end);
}
}
bool
anv_nir_compute_push_layout(nir_shader *nir,
const struct anv_physical_device *pdevice,
enum brw_robustness_flags robust_flags,
const struct anv_nir_push_layout_info *push_info,
struct brw_base_prog_key *prog_key,
struct brw_stage_prog_data *prog_data,
struct anv_pipeline_bind_map *map,
const struct anv_pipeline_push_map *push_map)
static struct push_data
gather_push_data(nir_shader *nir,
enum brw_robustness_flags robust_flags,
const struct intel_device_info *devinfo,
const struct anv_nir_push_layout_info *push_info,
struct brw_base_prog_key *prog_key,
struct anv_pipeline_bind_map *map,
struct set *lowered_ubo_instrs)
{
const struct brw_compiler *compiler = pdevice->compiler;
const struct intel_device_info *devinfo = compiler->devinfo;
memset(map->push_ranges, 0, sizeof(map->push_ranges));
bool has_const_ubo = false;
unsigned push_start = UINT_MAX, push_end = 0;
struct push_data data = {
.app_start = UINT_MAX, .app_end = 0,
.driver_start = UINT_MAX, .driver_end = 0,
};
nir_foreach_function_impl(impl, nir) {
nir_foreach_block(block, impl) {
nir_foreach_instr(instr, block) {
@ -144,12 +123,26 @@ anv_nir_compute_push_layout(nir_shader *nir,
has_const_ubo = true;
break;
case nir_intrinsic_load_push_constant:
case nir_intrinsic_load_push_data_intel: {
case nir_intrinsic_load_push_constant: {
unsigned base = nir_intrinsic_base(intrin);
unsigned range = nir_intrinsic_range(intrin);
push_start = MIN2(push_start, base);
push_end = MAX2(push_end, base + range);
data.app_start = MIN2(data.app_start, base);
data.app_end = MAX2(data.app_end, base + range);
break;
}
case nir_intrinsic_load_push_data_intel: {
if (lowered_ubo_instrs &&
_mesa_set_search(lowered_ubo_instrs, intrin)) {
has_const_ubo = true;
break;
}
unsigned base = nir_intrinsic_base(intrin);
unsigned range = nir_intrinsic_range(intrin);
data.driver_start = MIN2(data.driver_start, base);
data.driver_end = MAX2(data.driver_end, base + range);
/* We need to retain this information to update the push
* constant on vkCmdDispatch*().
*/
@ -167,67 +160,161 @@ anv_nir_compute_push_layout(nir_shader *nir,
}
}
const bool push_ubo_ranges =
data.push_ubo_ranges =
has_const_ubo && nir->info.stage != MESA_SHADER_COMPUTE &&
!brw_shader_stage_requires_bindless_resources(nir->info.stage);
const bool needs_wa_18019110168 =
data.needs_wa_18019110168 =
nir->info.stage == MESA_SHADER_FRAGMENT &&
brw_nir_fragment_shader_needs_wa_18019110168(
devinfo, push_info->mesh_dynamic ? INTEL_SOMETIMES : INTEL_NEVER, nir);
if (push_ubo_ranges && (robust_flags & BRW_ROBUSTNESS_UBO)) {
/* We can't on-the-fly adjust our push ranges because doing so would
* mess up the layout in the shader. When robustBufferAccess is
* enabled, we push a mask into the shader indicating which pushed
* registers are valid and we zero out the invalid ones at the top of
* the shader.
adjust_driver_push_values(nir, robust_flags, push_info,
prog_key, devinfo, &data);
return data;
}
struct lower_to_push_data_intel_state {
const struct anv_pipeline_bind_map *bind_map;
const struct anv_pipeline_push_map *push_map;
struct set *lowered_ubo_instrs;
/* Amount that should be subtracted from UBO loads converted to
* push_data_intel (in lowered_ubo_instrs)
*/
unsigned reduced_push_ranges;
};
/* Lower internal UBOs, only used for descriptor buffer loads when the offset
* is dynamic. We need to add the base offset of the descriptor buffer to the
* offset relative to the descriptor set.
*/
static bool
lower_internal_ubo(nir_builder *b,
nir_intrinsic_instr *intrin)
{
if (!anv_nir_is_internal_ubo(intrin->src[0]))
return false;
b->cursor = nir_before_instr(&intrin->instr);
nir_intrinsic_instr *resource = nir_src_as_intrinsic(intrin->src[0]);
/* Add the descriptor offset from the resource array_index source to the
* relative offset.
*/
nir_src_rewrite(&intrin->src[1],
nir_iadd(b, resource->src[2].ssa, intrin->src[1].ssa));
return true;
}
static bool
lower_ubo_to_push_data_intel(nir_builder *b,
nir_intrinsic_instr *intrin,
void *_data)
{
if (intrin->intrinsic != nir_intrinsic_load_ubo)
return false;
if (!anv_nir_is_promotable_ubo_binding(intrin->src[0]) ||
!nir_src_is_const(intrin->src[1]) ||
brw_shader_stage_requires_bindless_resources(b->shader->info.stage))
return lower_internal_ubo(b, intrin);
const struct lower_to_push_data_intel_state *state = _data;
const int block = anv_nir_get_ubo_binding_push_block(intrin->src[0]);
assert(block < state->push_map->block_count);
const struct anv_pipeline_binding *binding =
&state->push_map->block_to_descriptor[block];
const unsigned byte_offset = nir_src_as_uint(intrin->src[1]);
const unsigned num_components =
nir_def_last_component_read(&intrin->def) + 1;
const int bytes = num_components * (intrin->def.bit_size / 8);
uint32_t range_offset = 0;
const struct anv_push_range *push_range = NULL;
for (uint32_t i = 0; i < 4; i++) {
if (state->bind_map->push_ranges[i].set == binding->set &&
state->bind_map->push_ranges[i].index == binding->index &&
byte_offset >= state->bind_map->push_ranges[i].start * 32 &&
(byte_offset + bytes) <= (state->bind_map->push_ranges[i].start +
state->bind_map->push_ranges[i].length) * 32) {
push_range = &state->bind_map->push_ranges[i];
break;
} else {
range_offset += state->bind_map->push_ranges[i].length * 32;
}
}
if (push_range == NULL)
return lower_internal_ubo(b, intrin);
b->cursor = nir_before_instr(&intrin->instr);
nir_def *data = nir_load_push_data_intel(
b,
nir_def_last_component_read(&intrin->def) + 1,
intrin->def.bit_size,
nir_imm_int(b, 0),
.base = range_offset + byte_offset - push_range->start * 32,
.range = nir_intrinsic_range(intrin));
nir_def_replace(&intrin->def, data);
_mesa_set_add(state->lowered_ubo_instrs, nir_def_as_intrinsic(data));
return true;
}
static bool
lower_to_push_data_intel(nir_builder *b,
nir_intrinsic_instr *intrin,
void *_data)
{
const struct lower_to_push_data_intel_state *state = _data;
/* With bindless shaders we load uniforms with SEND messages. All the push
* constants are located after the RT_DISPATCH_GLOBALS. We just need to add
* the offset to the address right after RT_DISPATCH_GLOBALS (see
* brw_nir_lower_rt_intrinsics.c).
*/
const unsigned base_offset =
brw_shader_stage_is_bindless(b->shader->info.stage) ?
0 : state->bind_map->push_ranges[0].start * 32;
switch (intrin->intrinsic) {
case nir_intrinsic_load_push_data_intel:
/* For lowered UBOs to push constants, shrink the base by the amount we
* shrunk the driver push constants.
*/
const uint32_t push_reg_mask_start =
anv_drv_const_offset(gfx.push_reg_mask[nir->info.stage]);
const uint32_t push_reg_mask_end =
push_reg_mask_start +
anv_drv_const_size(gfx.push_reg_mask[nir->info.stage]);
push_start = MIN2(push_start, push_reg_mask_start);
push_end = MAX2(push_end, push_reg_mask_end);
if (_mesa_set_search(state->lowered_ubo_instrs, intrin))
nir_intrinsic_set_base(intrin, nir_intrinsic_base(intrin) - state->reduced_push_ranges);
else
nir_intrinsic_set_base(intrin, nir_intrinsic_base(intrin) - base_offset);
return true;
case nir_intrinsic_load_push_constant: {
b->cursor = nir_before_instr(&intrin->instr);
nir_def *data = nir_load_push_data_intel(
b,
intrin->def.num_components,
intrin->def.bit_size,
intrin->src[0].ssa,
.base = nir_intrinsic_base(intrin) - base_offset,
.range = nir_intrinsic_range(intrin));
nir_def_replace(&intrin->def, data);
return true;
}
if (nir->info.stage == MESA_SHADER_FRAGMENT) {
if (push_info->fragment_dynamic) {
const uint32_t fs_config_start =
anv_drv_const_offset(gfx.fs_config);
const uint32_t fs_config_end =
fs_config_start +
anv_drv_const_size(gfx.fs_config);
push_start = MIN2(push_start, fs_config_start);
push_end = MAX2(push_end, fs_config_end);
}
if (needs_wa_18019110168) {
const uint32_t fs_per_prim_remap_start =
anv_drv_const_offset(gfx.fs_per_prim_remap_offset);
const uint32_t fs_per_prim_remap_end =
fs_per_prim_remap_start +
anv_drv_const_size(gfx.fs_per_prim_remap_offset);
push_start = MIN2(push_start, fs_per_prim_remap_start);
push_end = MAX2(push_end, fs_per_prim_remap_end);
}
}
const bool needs_dyn_tess_config =
(nir->info.stage == MESA_SHADER_TESS_CTRL &&
(container_of(prog_key, struct brw_tcs_prog_key, base)->input_vertices == 0 ||
push_info->separate_tessellation)) ||
(nir->info.stage == MESA_SHADER_TESS_EVAL &&
push_info->separate_tessellation);
if (needs_dyn_tess_config) {
const uint32_t tess_config_start = anv_drv_const_offset(gfx.tess_config);
const uint32_t tess_config_end = tess_config_start +
anv_drv_const_size(gfx.tess_config);
push_start = MIN2(push_start, tess_config_start);
push_end = MAX2(push_end, tess_config_end);
default:
return false;
}
}
static struct anv_push_range
compute_final_push_range(const struct intel_device_info *devinfo,
const struct push_data *data)
{
/* Align push_start down to a 32B (for 3DSTATE_CONSTANT) and make it no
* larger than push_end (no push constants is indicated by push_start =
* UINT_MAX).
@ -254,14 +341,50 @@ anv_nir_compute_push_layout(nir_shader *nir,
* (unlike all Gfx stages) and so we can bound+align the allocation there
* (see anv_cmd_buffer_cs_push_constants).
*/
push_start = MIN2(push_start, push_end);
unsigned push_start = UINT32_MAX;
if (data->app_end != 0)
push_start = MIN2(push_start, data->app_start);
if (data->driver_end != 0)
push_start = MIN2(push_start, data->driver_start);
if (push_start == UINT32_MAX) {
return (struct anv_push_range) {
.set = ANV_DESCRIPTOR_SET_PUSH_CONSTANTS,
};
}
push_start = ROUND_DOWN_TO(push_start, 32);
const struct anv_push_range push_constant_range = {
const unsigned push_size = align(
MAX2(data->app_end, data->driver_end) - push_start, devinfo->grf_size);
return (struct anv_push_range) {
.set = ANV_DESCRIPTOR_SET_PUSH_CONSTANTS,
.start = push_start / 32,
.length = align(push_end - push_start, devinfo->grf_size) / 32,
.length = push_size / 32,
};
}
bool
anv_nir_compute_push_layout(nir_shader *nir,
const struct anv_physical_device *pdevice,
enum brw_robustness_flags robust_flags,
const struct anv_nir_push_layout_info *push_info,
struct brw_base_prog_key *prog_key,
struct brw_stage_prog_data *prog_data,
struct anv_pipeline_bind_map *map,
const struct anv_pipeline_push_map *push_map)
{
const struct brw_compiler *compiler = pdevice->compiler;
const struct intel_device_info *devinfo = compiler->devinfo;
memset(map->push_ranges, 0, sizeof(map->push_ranges));
struct push_data data =
gather_push_data(nir, robust_flags, devinfo, push_info, prog_key, map, NULL);
struct anv_push_range push_constant_range =
compute_final_push_range(devinfo, &data);
/* When platforms support Mesh and the fragment shader is not fully linked
* to the previous shader, payload format can change if the preceding
@ -288,54 +411,40 @@ anv_nir_compute_push_layout(nir_shader *nir,
* dynamic bit in fs_config_intel.
*/
const bool needs_padding_per_primitive =
needs_wa_18019110168 ||
data.needs_wa_18019110168 ||
(push_info->mesh_dynamic &&
(nir->info.inputs_read & VARYING_BIT_PRIMITIVE_ID));
unsigned n_push_ranges = 0;
unsigned total_push_regs = 0;
if (push_constant_range.length > 0)
if (push_constant_range.length > 0) {
map->push_ranges[n_push_ranges++] = push_constant_range;
total_push_regs += push_constant_range.length;
}
if (push_ubo_ranges) {
struct brw_ubo_range ubo_ranges[4] = {};
struct anv_push_range analysis_ranges[4] = {};
if (data.push_ubo_ranges) {
anv_nir_analyze_push_constants_ranges(nir, devinfo, push_map,
analysis_ranges);
}
brw_nir_analyze_ubo_ranges(compiler, nir, ubo_ranges);
const unsigned max_push_buffers = needs_padding_per_primitive ? 3 : 4;
const unsigned max_push_regs = needs_padding_per_primitive ? 63 : 64;
const unsigned max_push_regs = 64;
for (unsigned i = 0; i < 4; i++) {
struct anv_push_range *candidate_range = &analysis_ranges[i];
if (n_push_ranges >= max_push_buffers)
break;
unsigned total_push_regs = push_constant_range.length;
for (unsigned i = 0; i < 4; i++) {
if (total_push_regs + ubo_ranges[i].length > max_push_regs)
ubo_ranges[i].length = max_push_regs - total_push_regs;
total_push_regs += ubo_ranges[i].length;
}
assert(total_push_regs <= max_push_regs);
if (candidate_range->length + total_push_regs > max_push_regs)
candidate_range->length = max_push_regs - total_push_regs;
const unsigned max_push_buffers = needs_padding_per_primitive ? 3 : 4;
if (candidate_range->length == 0)
break;
for (unsigned i = 0; i < 4; i++) {
struct brw_ubo_range *ubo_range = &ubo_ranges[i];
if (ubo_range->length == 0)
continue;
if (n_push_ranges >= max_push_buffers) {
memset(ubo_range, 0, sizeof(*ubo_range));
continue;
}
assert(ubo_range->block < push_map->block_count);
const struct anv_pipeline_binding *binding =
&push_map->block_to_descriptor[ubo_range->block];
map->push_ranges[n_push_ranges++] = (struct anv_push_range) {
.set = binding->set,
.index = binding->index,
.dynamic_offset_index = binding->dynamic_offset_index,
.start = ubo_range->start,
.length = ubo_range->length,
};
}
map->push_ranges[n_push_ranges++] = *candidate_range;
total_push_regs += candidate_range->length;
}
/* Pass a single-register push constant payload for the PS stage even if
@ -366,13 +475,44 @@ anv_nir_compute_push_layout(nir_shader *nir,
assert(n_push_ranges <= 4);
struct lower_to_push_data_intel_state lower_state = {
.bind_map = map,
.push_map = push_map,
.lowered_ubo_instrs = _mesa_pointer_set_create(NULL),
};
bool progress = nir_shader_intrinsics_pass(
nir, lower_ubo_to_push_data_intel,
nir_metadata_control_flow, &lower_state);
if (progress && nir_opt_dce(nir)) {
/* Regather the push data */
data = gather_push_data(nir, robust_flags, devinfo, push_info, prog_key,
map, lower_state.lowered_ubo_instrs);
/* Update the ranges */
struct anv_push_range shrinked_push_constant_range =
compute_final_push_range(devinfo, &data);
assert(shrinked_push_constant_range.length <= push_constant_range.length);
if (shrinked_push_constant_range.length > 0) {
map->push_ranges[0] = shrinked_push_constant_range;
} else if (map->push_ranges[0].set == shrinked_push_constant_range.set) {
memmove(&map->push_ranges[0], &map->push_ranges[1], 3 * sizeof(map->push_ranges[0]));
memset(&map->push_ranges[3], 0, sizeof(map->push_ranges[3]));
}
lower_state.reduced_push_ranges = 32 *
(push_constant_range.length - shrinked_push_constant_range.length);
push_constant_range = shrinked_push_constant_range;
}
/* Finally lower the application's push constants & driver' push data */
progress |= nir_shader_intrinsics_pass(
nir, lower_to_push_data_intel,
nir_metadata_control_flow,
&(struct lower_to_push_data_intel_state) {
.bind_map = map,
.push_map = push_map,
});
nir_metadata_control_flow, &lower_state);
ralloc_free(lower_state.lowered_ubo_instrs);
/* Do this before calling brw_cs_fill_push_const_info(), it uses the data
* in prog_data->push_sizes[].
@ -390,17 +530,17 @@ anv_nir_compute_push_layout(nir_shader *nir,
prog_data->push_sizes[i] = map->push_ranges[i].length * 32;
}
unsigned push_start = push_constant_range.start * 32;
if (prog_data->robust_ubo_ranges) {
const uint32_t push_reg_mask_offset =
anv_drv_const_offset(gfx.push_reg_mask[nir->info.stage]);
assert(push_reg_mask_offset >= push_start);
prog_data->push_reg_mask_param =
(push_reg_mask_offset - push_start) / 4;
prog_data->push_reg_mask_param = (push_reg_mask_offset - push_start) / 4;
}
switch (nir->info.stage) {
case MESA_SHADER_TESS_CTRL:
if (needs_dyn_tess_config) {
if (data.needs_dyn_tess_config) {
struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
const uint32_t tess_config_offset = anv_drv_const_offset(gfx.tess_config);
@ -429,7 +569,7 @@ anv_nir_compute_push_layout(nir_shader *nir,
assert(fs_config_offset >= push_start);
fs_prog_data->fs_config_param = fs_config_offset - push_start;
}
if (needs_wa_18019110168) {
if (data.needs_wa_18019110168) {
const uint32_t fs_per_prim_remap_offset =
anv_drv_const_offset(gfx.fs_per_prim_remap_offset);
assert(fs_per_prim_remap_offset >= push_start);
@ -441,8 +581,8 @@ anv_nir_compute_push_layout(nir_shader *nir,
case MESA_SHADER_COMPUTE: {
const int subgroup_id_index =
push_end == (anv_drv_const_offset(cs.subgroup_id) +
anv_drv_const_size(cs.subgroup_id)) ?
data.driver_end == (anv_drv_const_offset(cs.subgroup_id) +
anv_drv_const_size(cs.subgroup_id)) ?
(anv_drv_const_offset(cs.subgroup_id) - push_start) / 4 : -1;
struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
brw_cs_fill_push_const_info(devinfo, cs_prog_data, subgroup_id_index);

View file

@ -36,6 +36,9 @@ update_resource_intel_block(nir_builder *b, nir_intrinsic_instr *intrin,
if (intrin->intrinsic != nir_intrinsic_resource_intel)
return false;
if (nir_intrinsic_resource_access_intel(intrin) & nir_resource_intel_internal)
return false;
/* If the array index in the descriptor binding is not const, we won't be
* able to turn this load_ubo into a push constant.
*

View file

@ -0,0 +1,336 @@
/* Copyright © 2026 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "anv_nir.h"
#include "util/u_dynarray.h"
/* A candidate push-constant range together with its estimated benefit
 * (accumulated number of UBO loads the range would turn from pull loads
 * into pushed data).
 */
struct push_range_entry
{
   struct anv_push_range range;
   /* Sum of per-register use counts over the whole range (see
    * push_block_info::uses).
    */
   int benefit;
};
/* Relative worth of promoting data from a given descriptor set kind. */
static int
set_score(uint8_t set)
{
   /* Regular application descriptor sets hold plain UBO bindings. */
   if (set < MAX_SETS)
      return 1;

   if (set == ANV_DESCRIPTOR_SET_DESCRIPTORS) {
      /* Promotion of descriptor data ranks above plain UBOs because it can
       * contain inline uniforms or descriptor contents reused for later
       * resource accesses.
       */
      return 3;
   }

   UNREACHABLE("unexpected push set");
}
/* Score of a candidate range: weigh its use count (benefit) against the
 * number of push registers it occupies (cost).
 */
static int
score(const struct push_range_entry *entry)
{
   const int benefit = entry->benefit;
   const int cost = entry->range.length;
   return benefit * 2 - cost;
}
/**
 * qsort() comparator for push_range_entry, producing a descending order
 * by score, with deterministic tie-breaking.
 */
static int
cmp_push_range_entry(const void *va, const void *vb)
{
   const struct push_range_entry *a = va;
   const struct push_range_entry *b = vb;

   /* Primary key: overall score, highest first. */
   int delta = score(b) - score(a);
   if (delta != 0)
      return delta;

   /* Tie-breaker: promotion type, highest first. */
   delta = set_score(b->range.set) - set_score(a->range.set);
   if (delta != 0)
      return delta;

   /* Tie-breaker: set index, highest first. */
   delta = b->range.set - a->range.set;
   if (delta != 0)
      return delta;

   /* Tie-breaker: UBO block index, highest first. */
   delta = b->range.index - a->range.index;
   if (delta != 0)
      return delta;

   /* Final tie-breaker: start offset, lowest first. */
   return a->range.start - b->range.start;
}
/* Kind of pushable block tracked by the analysis; currently only UBOs. */
enum push_block_type {
   PUSH_BLOCK_TYPE_UBO = 1,
};

/* Hash table key identifying one pushable block (type + block index). */
struct push_block_key
{
   enum push_block_type type;
   uint32_t index;
};

/* Per-block usage data accumulated while walking the shader IR. */
struct push_block_info
{
   struct push_block_key key;
   /* Each bit in the offsets bitfield represents a GRF-sized section of
    * data. If it's set to one, there is interesting UBO data at that
    * offset. If not, there's a "hole" - padding between data - or just
    * nothing at all.
    */
   uint64_t offsets;
   /* Per-GRF-slot load count; indexed the same way as the offsets bits. */
   uint8_t uses[64];
};

/* Transient state shared by the analysis helpers. */
struct push_analysis_state
{
   const struct intel_device_info *devinfo;
   /* Maps push_block_key -> push_block_info. */
   struct hash_table *blocks;
};
static uint32_t
push_block_key_hash(const void *key)
{
return _mesa_hash_data(key, sizeof(struct push_block_key));
}
static bool
push_block_key_compare(const void *key1, const void *key2)
{
return memcmp(key1, key2, sizeof(struct push_block_key)) == 0;
}
static struct push_block_info *
get_block_info(struct push_analysis_state *state,
enum push_block_type type, uint32_t index)
{
struct push_block_key key = { .type = type, .index = index, };
struct hash_entry *entry =
_mesa_hash_table_search(state->blocks, &key);
if (entry)
return (struct push_block_info *) entry->data;
struct push_block_info *info =
rzalloc(state->blocks, struct push_block_info);
info->key = key;
_mesa_hash_table_insert(state->blocks, &info->key, info);
return info;
}
/* Record one constant-offset UBO load into the per-block usage data.
 *
 * Caller guarantees intrin is a load_ubo with a promotable binding
 * (src[0]) and a constant byte offset (src[1]).
 */
static void
maybe_add_pushable_ubo(struct push_analysis_state *state,
                       nir_intrinsic_instr *intrin)
{
   const int block = anv_nir_get_ubo_binding_push_block(intrin->src[0]);
   const unsigned byte_offset = nir_src_as_uint(intrin->src[1]);
   /* GRF-sized slot index of the load's first register. */
   const int offset = byte_offset / state->devinfo->grf_size;

   /* Avoid shifting by larger than the width of our bitfield, as this
    * is undefined in C. Even if we require multiple bits to represent
    * the entire value, it's OK to record a partial value - the backend
    * is capable of falling back to pull loads for later components of
    * vectors, as it has to shrink ranges for other reasons anyway.
    */
   if (offset >= 64)
      return;

   /* The value might span multiple GRFs. */
   const unsigned num_components =
      nir_def_last_component_read(&intrin->def) + 1;
   const int bytes = num_components * (intrin->def.bit_size / 8);
   /* Expand [byte_offset, byte_offset + bytes) to GRF-aligned boundaries
    * and count how many GRF-sized chunks the access touches.
    */
   const int start = ROUND_DOWN_TO(byte_offset, state->devinfo->grf_size);
   const int end = align(byte_offset + bytes, state->devinfo->grf_size);
   const int chunks = (end - start) / state->devinfo->grf_size;

   /* TODO: should we count uses in loops as higher benefit? */
   struct push_block_info *info =
      get_block_info(state, PUSH_BLOCK_TYPE_UBO, block);
   /* Mark every chunk touched; bits shifted past bit 63 are silently
    * dropped (partial recording is fine, see comment above).
    */
   info->offsets |= ((1ull << chunks) - 1) << offset;
   /* Benefit is attributed to the first slot of the access only. */
   info->uses[offset]++;
}
/* Scan one NIR block for pushable UBO loads and record them in state. */
static void
analyze_pushable_block(struct push_analysis_state *state, nir_block *block)
{
   nir_foreach_instr(instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      if (intrin->intrinsic != nir_intrinsic_load_ubo)
         continue;

      /* Only loads from a promotable binding at a constant offset can
       * become push constants.
       */
      if (anv_nir_is_promotable_ubo_binding(intrin->src[0]) &&
          nir_src_is_const(intrin->src[1]))
         maybe_add_pushable_ubo(state, intrin);
   }
}
/* Debug helper: dump one push range candidate to 'file'.
 *
 * 'info' may be NULL (used when printing the sorted entry list, where the
 * per-block bitfield is no longer at hand); the bits column then shows 0.
 */
static void
print_push_entry(FILE *file,
                 const struct push_block_info *info,
                 const struct push_range_entry *entry,
                 struct push_analysis_state *state)
{
   /* NOTE(review): 'state' is currently unused; kept so both call sites
    * keep working unchanged.
    */
   (void)state;

   fprintf(file,
           "set %2d, index %2d, start %2d, length %2d, bits = %"PRIx64", "
           "benefit %2d, cost %2d, score = %2d\n",
           entry->range.set, entry->range.index,
           entry->range.start, entry->range.length,
           /* PRIx64 requires a uint64_t argument: the previous 0ul literal
            * is only 32-bit on ILP32/LLP64 targets, which is undefined
            * behavior through varargs.
            */
           info ? info->offsets : (uint64_t)0,
           entry->benefit, entry->range.length, score(entry));
}
/**
 * Analyze a shader's constant-offset UBO loads and fill out_ranges[4] with
 * the most beneficial candidate push-constant ranges.
 *
 * Ranges are expressed in units of 32B registers (start/length), sorted by
 * descending score, and clamped so their total length fits in 64 registers.
 * Unused tail entries of out_ranges[] are zeroed.
 */
void
anv_nir_analyze_push_constants_ranges(nir_shader *nir,
                                      const struct intel_device_info *devinfo,
                                      const struct anv_pipeline_push_map *push_map,
                                      struct anv_push_range out_ranges[4])
{
   void *mem_ctx = ralloc_context(NULL);

   struct push_analysis_state state = {
      .devinfo = devinfo,
      .blocks = _mesa_hash_table_create(mem_ctx,
                                        push_block_key_hash,
                                        push_block_key_compare),
   };

   /* Walk the IR, recording how many times each UBO block/offset is used. */
   nir_foreach_function_impl(impl, nir) {
      nir_foreach_block(block, impl) {
         analyze_pushable_block(&state, block);
      }
   }

   /* Find ranges: a block, starting register-size aligned byte offset, and
    * length.
    */
   struct util_dynarray ranges;
   util_dynarray_init(&ranges, mem_ctx);

   hash_table_foreach(state.blocks, entry) {
      const struct push_block_info *info = entry->data;
      uint64_t offsets = info->offsets;

      /* Walk through the offsets bitfield, finding contiguous regions of
       * set bits:
       *
       *   0000000001111111111111000000000000111111111111110000000011111100
       *            ^^^^^^^^^^^^^            ^^^^^^^^^^^^^^        ^^^^^^
       *
       * Each of these will become a UBO range.
       */
      while (offsets != 0) {
         /* Find the first 1 in the offsets bitfield. This represents the
          * start of a range of interesting UBO data. Make it zero-indexed.
          */
         int first_bit = ffsll(offsets) - 1;

         /* Find the first 0 bit in offsets beyond first_bit. To find the
          * first zero bit, we find the first 1 bit in the complement. In
          * order to ignore bits before first_bit, we mask off those bits.
          */
         int first_hole = ffsll(~offsets & ~((1ull << first_bit) - 1)) - 1;
         if (first_hole == -1) {
            /* If we didn't find a hole, then set it to the end of the
             * bitfield. There are no more ranges to process.
             */
            first_hole = 64;
            offsets = 0;
         } else {
            /* We've processed all bits before first_hole. Mask them off. */
            offsets &= ~((1ull << first_hole) - 1);
         }

         /* Record the range (in GRF-sized units at this point) along with
          * its accumulated benefit.
          */
         struct push_range_entry *entry =
            util_dynarray_grow(&ranges, struct push_range_entry, 1);

         assert(info->key.index < push_map->block_count);
         const struct anv_pipeline_binding *binding =
            &push_map->block_to_descriptor[info->key.index];

         entry->range.set = binding->set;
         entry->range.index = binding->index;
         entry->range.dynamic_offset_index = binding->dynamic_offset_index;
         entry->range.start = first_bit;
         /* first_hole is one beyond the end, so we don't need to add 1 */
         entry->range.length = first_hole - first_bit;
         entry->benefit = 0;
         for (int i = 0; i < entry->range.length; i++)
            entry->benefit += info->uses[first_bit + i];

         /* Flip to true for debugging. */
         if (false)
            print_push_entry(stderr, info, entry, &state);
      }
   }

   /* TODO: Consider combining ranges.
    *
    * We can only push 4 ranges via 3DSTATE_CONSTANT_XS. If there are
    * more ranges, and two are close by with only a small hole, it may be
    * worth combining them. The holes will waste register space, but the
    * benefit of removing pulls may outweigh that cost.
    */

   /* Sort the list so the most beneficial ranges are at the front. */
   int nr_entries = ranges.size / sizeof(struct push_range_entry);
   if (nr_entries > 0) {
      qsort(ranges.data, nr_entries, sizeof(struct push_range_entry),
            cmp_push_range_entry);
   }

   /* Flip to true for debugging. */
   if (false) {
      util_dynarray_foreach(&ranges, struct push_range_entry, entry) {
         print_push_entry(stderr, NULL, entry, &state);
      }
   }

   /* Convert start/length from GRF-sized units to 32B register units
    * (grf_size is 32 or 64 depending on the platform).
    */
   struct push_range_entry *entries = ranges.data;
   for (unsigned i = 0; i < nr_entries; i++) {
      entries[i].range.start *= devinfo->grf_size / 32;
      entries[i].range.length *= devinfo->grf_size / 32;
   }

   /* Return the top 4, limited to the maximum number of push registers.
    *
    * The Vulkan driver sets up additional non-UBO push constants, so it may
    * need to shrink these ranges further (see anv_nir_compute_push_layout.c).
    * The OpenGL driver treats legacy uniforms as a UBO, so this is enough.
    *
    * To limit further, simply drop the tail of the list, as that's the least
    * valuable portion.
    */
   const int max_ubos = 4;
   nr_entries = MIN2(nr_entries, max_ubos);

   /* Clamp range lengths so the total stays within 64 push registers. */
   const unsigned max_push = 64;
   unsigned total_push = 0;
   for (unsigned i = 0; i < nr_entries; i++) {
      if (total_push + entries[i].range.length > max_push)
         entries[i].range.length = max_push - total_push;
      total_push += entries[i].range.length;
   }

   for (int i = 0; i < nr_entries; i++)
      out_ranges[i] = entries[i].range;
   for (int i = nr_entries; i < 4; i++)
      out_ranges[i] = (struct anv_push_range) {};

   /* ranges.mem_ctx is mem_ctx; this frees the hash table and all block
    * records ralloc'ed off it as well.
    */
   ralloc_free(ranges.mem_ctx);
}

View file

@ -203,6 +203,10 @@ anv_nir_push_desc_ubo_fully_promoted(nir_shader *nir,
if (nir_intrinsic_desc_set(resource) != push_set)
continue;
/* Skip load_ubo loading the descriptor buffer (not a binding) */
if (nir_intrinsic_binding(resource) == UINT32_MAX)
continue;
uint32_t binding = nir_intrinsic_binding(resource);
/* If we have indirect indexing in the binding, no push promotion

View file

@ -2687,6 +2687,10 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
break;
case ANV_DESCRIPTOR_SET_DESCRIPTORS:
/* We have LSC_SS surface states for this, binding table isn't
* needed.
*/
assert(!cmd_buffer->device->info->has_lsc);
if (shader->bind_map.layout_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER) {
assert(pipe_state->descriptor_buffers[binding->index].state.alloc_size);
bt_map[s] = pipe_state->descriptor_buffers[binding->index].state.offset +

View file

@ -181,6 +181,7 @@ libanv_files = files(
'anv_nir_lower_ubo_loads.c',
'anv_nir_lower_resource_intel.c',
'anv_nir_lower_unaligned_dispatch.c',
'anv_nir_push_constants_analysis.c',
'anv_nir_push_descriptor_analysis.c',
'anv_perf.c',
'anv_physical_device.c',