anv: add apply_layout support for device bindable shaders/pipelines

We consider them like bindless stages (no binding table) as much as
possible.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Acked-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31384>
This commit is contained in:
Lionel Landwerlin 2024-06-05 15:48:55 +03:00 committed by Marge Bot
parent af8c85b5bd
commit 6f5d30c0a2
5 changed files with 224 additions and 55 deletions

View file

@ -604,6 +604,7 @@ enum intel_shader_reloc_id {
BRW_SHADER_RELOC_RESUME_SBT_ADDR_HIGH,
BRW_SHADER_RELOC_DESCRIPTORS_ADDR_HIGH,
BRW_SHADER_RELOC_DESCRIPTORS_BUFFER_ADDR_HIGH,
BRW_SHADER_RELOC_PUSH_DESCRIPTORS_BUFFER_ADDR_HIGH,
BRW_SHADER_RELOC_DESCRIPTORS_VIEW_HANDLE,
BRW_SHADER_RELOC_DESCRIPTORS_BUFFERS_VIEW_HANDLE,
BRW_SHADER_RELOC_INSTRUCTION_BASE_ADDR_HIGH,

View file

@ -113,6 +113,7 @@ bool anv_nir_apply_pipeline_layout(nir_shader *shader,
struct anv_descriptor_set_layout * const *set_layouts,
uint32_t set_count,
const uint32_t *dynamic_offset_start,
bool device_bindable,
struct anv_pipeline_bind_map *map,
struct anv_pipeline_push_map *push_map,
void *push_map_mem_ctx);

View file

@ -64,9 +64,11 @@ struct apply_pipeline_layout_state {
bool uses_constants;
bool has_dynamic_buffers;
bool is_device_bindable;
uint8_t constants_offset;
nir_variable *set_idx_to_bti;
nir_variable *set_idx_to_base_addr;
nir_variable *set_idx_to_offset;
struct {
@ -505,8 +507,6 @@ build_load_descriptor_mem_from_res_index(nir_builder *b,
{
struct res_index_defs res = unpack_res_index(b, res_index);
nir_def *surface_index = nir_load_array_var(b, state->set_idx_to_bti, res.set);
nir_def *offset32 = nir_iadd_imm(
b,
nir_iadd(b,
@ -515,12 +515,24 @@ build_load_descriptor_mem_from_res_index(nir_builder *b,
nir_imul(b, res.array_index, res.desc_stride))),
imm_offset);
return nir_load_ubo(b, num_components, bit_size,
surface_index, offset32,
.align_mul = 8,
.align_offset = imm_offset % 8,
.range_base = 0,
.range = num_components * bit_size / 8);
if (state->pdevice->info.has_lsc || !state->is_device_bindable) {
nir_def *surface_index = nir_load_array_var(b, state->set_idx_to_bti, res.set);
return nir_load_ubo(b, num_components, bit_size,
surface_index, offset32,
.align_mul = 8,
.align_offset = imm_offset % 8,
.range_base = 0,
.range = num_components * bit_size / 8);
} else {
return nir_load_global_constant(
b, num_components, bit_size,
nir_pack_64_2x32_split(b, offset32,
nir_load_array_var(b, state->set_idx_to_base_addr,
res.set)),
.align_mul = 8,
.align_offset = imm_offset % 8);
}
}
/* When using direct descriptor, we do not have a structure to read in memory
@ -704,30 +716,6 @@ build_load_storage_3d_image_depth(nir_builder *b,
}
}
/** Build a 64bit_global_32bit_offset address for a descriptor set */
static nir_def *
build_desc_address64(nir_builder *b, nir_def *set_idx, unsigned set_idx_imm,
const struct apply_pipeline_layout_state *state)
{
/* Low 32 bits: the set's offset inside the descriptor heap, read from a
 * driver uniform either at a dynamic index (@set_idx) or an immediate
 * index (@set_idx_imm) when @set_idx is NULL.
 */
nir_def *desc_offset = set_idx != NULL ?
anv_load_driver_uniform_indexed(b, 1, desc_surface_offsets, set_idx) :
anv_load_driver_uniform(b, 1, desc_surface_offsets[set_idx_imm]);
/* The uniform packs extra bits; keep only the offset part. */
desc_offset = nir_iand_imm(b, desc_offset, ANV_DESCRIPTOR_SET_OFFSET_MASK);
/* Buffer-type layouts without extended bindless are rebased by adding the
 * surfaces base offset driver uniform.
 */
if (state->bind_map->layout_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER &&
!intel_has_extended_bindless(&state->pdevice->info)) {
nir_def *bindless_base_offset =
anv_load_driver_uniform(b, 1, surfaces_base_offset);
desc_offset = nir_iadd(b, bindless_base_offset, desc_offset);
}
/* High 32 bits come from a shader relocation constant resolved at upload
 * time, picked by the layout type.
 */
return nir_pack_64_2x32_split(
b, desc_offset,
nir_load_reloc_const_intel(
b,
state->bind_map->layout_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER ?
BRW_SHADER_RELOC_DESCRIPTORS_BUFFER_ADDR_HIGH :
BRW_SHADER_RELOC_DESCRIPTORS_ADDR_HIGH));
}
static nir_def *
build_descriptor_set_bti(nir_builder *b,
uint32_t set,
@ -769,12 +757,38 @@ build_descriptor_set_bti(nir_builder *b,
}
}
/* Return the high 32 bits of the base address of the heap holding
 * descriptor set @set, as a shader relocation constant resolved when the
 * shader is uploaded (see anv_shader_set_relocs for the values).
 */
static nir_def *
build_descriptor_set_base_address(nir_builder *b,
uint32_t set,
const struct apply_pipeline_layout_state *state)
{
/* Push descriptor sets get a dedicated relocation on pre-LSC platforms
 * (they live in a different pool than regular descriptor buffers).
 */
const bool is_push_set = set < state->set_count &&
state->set_layouts[set] != NULL &&
(state->set_layouts[set]->vk.flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
enum intel_shader_reloc_id reloc_id;
if (state->pdevice->info.has_lsc) {
reloc_id =
state->bind_map->layout_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER ?
BRW_SHADER_RELOC_DESCRIPTORS_BUFFER_ADDR_HIGH :
BRW_SHADER_RELOC_DESCRIPTORS_ADDR_HIGH;
} else {
/* Pre-LSC: push descriptor buffers use
 * BRW_SHADER_RELOC_PUSH_DESCRIPTORS_BUFFER_ADDR_HIGH (internal surface
 * state pool); note is_push_set only matters on this path.
 */
reloc_id =
state->bind_map->layout_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER ?
is_push_set ? BRW_SHADER_RELOC_PUSH_DESCRIPTORS_BUFFER_ADDR_HIGH :
BRW_SHADER_RELOC_DESCRIPTORS_BUFFER_ADDR_HIGH :
BRW_SHADER_RELOC_DESCRIPTORS_ADDR_HIGH;
}
return nir_load_reloc_const_intel(b, reloc_id);
}
static nir_def *
build_descriptor_set_offset(nir_builder *b,
uint32_t set,
const struct apply_pipeline_layout_state *state)
{
if (state->pdevice->info.has_lsc) {
if (state->pdevice->info.has_lsc || state->is_device_bindable) {
return nir_iand_imm(b,
anv_load_driver_uniform(b, 1, desc_surface_offsets[set]),
ANV_DESCRIPTOR_SET_OFFSET_MASK /* array_index */);
@ -801,6 +815,27 @@ build_desc_address32(nir_builder *b,
offset));
}
/** Build a 64bit_bounded_global address for a descriptor set.
 *
 * Produces a vec4: (set heap offset + @offset, relocated heap base high
 * dword, @range, 0).
 * NOTE(review): assumed to match nir_address_format_64bit_bounded_global
 * component ordering (addr_lo, addr_hi, bound, offset) — confirm against
 * nir_lower_explicit_io.
 *
 * When @set >= MAX_SETS the dynamic @set_idx selects the per-set array
 * entries; otherwise the immediate @set is used.
 */
static nir_def *
build_desc_address64(nir_builder *b,
nir_def *set_idx, unsigned set,
nir_def *offset,
uint32_t range,
const struct apply_pipeline_layout_state *state)
{
return nir_vec4(b,
nir_iadd(b,
nir_load_array_var(b, state->set_idx_to_offset,
set < MAX_SETS ?
nir_imm_int(b, set) : set_idx),
offset),
nir_load_array_var(b, state->set_idx_to_base_addr,
set < MAX_SETS ?
nir_imm_int(b, set) : set_idx),
nir_imm_int(b, range),
nir_imm_int(b, 0));
}
/** Whether a surface is accessed through the bindless surface state heap */
static bool
is_binding_bindless(unsigned set, unsigned binding, bool sampler,
@ -880,19 +915,6 @@ build_desc_addr_for_binding(nir_builder *b,
return build_desc_address32(b, NULL, set, desc_offset, state);
}
/* Build a 32-bit (binding table index, offset) address for an inline
 * uniform block's data within its descriptor set.
 */
static nir_def *
build_inline_desc_addr32(nir_builder *b,
unsigned set,
const struct anv_descriptor_set_binding_layout *bind_layout,
const struct apply_pipeline_layout_state *state)
{
/* vec2: .x = BTI of the set's surface, .y = set offset plus the
 * binding's surface offset within the set.
 */
return nir_vec2(
b,
nir_load_array_var_imm(b, state->set_idx_to_bti, set),
nir_iadd_imm(b, nir_load_array_var_imm(b, state->set_idx_to_offset, set),
bind_layout->descriptor_surface_offset));
}
static unsigned
binding_descriptor_offset(const struct apply_pipeline_layout_state *state,
const struct anv_descriptor_set_binding_layout *bind_layout,
@ -1237,9 +1259,16 @@ build_buffer_addr_for_binding(nir_builder *b,
&state->set_layouts[set]->binding[binding];
if (bind_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
return build_desc_address32(b, NULL, set,
nir_imm_int(b, bind_layout->descriptor_surface_offset),
state);
if (!state->pdevice->info.has_lsc && state->is_device_bindable) {
return build_desc_address64(b, NULL, set,
nir_imm_int(b, bind_layout->descriptor_surface_offset),
bind_layout->array_size,
state);
} else {
return build_desc_address32(b, NULL, set,
nir_imm_int(b, bind_layout->descriptor_surface_offset),
state);
}
}
if (addr_format != nir_address_format_32bit_index_offset)
@ -1428,9 +1457,6 @@ try_lower_direct_buffer_intrinsic(nir_builder *b,
return false;
}
/* Rewrite to 32bit_index_offset whenever we can */
addr_format = nir_address_format_32bit_index_offset;
/* If a dynamic has not been assigned a binding table entry, we need to
* bail here.
*/
@ -1438,6 +1464,12 @@ try_lower_direct_buffer_intrinsic(nir_builder *b,
!descriptor_has_bti(desc, state))
return false;
/* Rewrite to 32bit_index_offset */
addr_format =
(!state->pdevice->info.has_lsc && state->is_device_bindable) ?
nir_address_format_64bit_bounded_global :
nir_address_format_32bit_index_offset;
nir_def *addr =
build_buffer_addr_for_deref(b, deref, addr_format, state);
@ -2190,6 +2222,115 @@ binding_should_use_sampler_binding_table(const struct apply_pipeline_layout_stat
return true;
}
/* This builds a binding table based on the push descriptor layout, so all the
* shaders using a compatible layout can share the same binding table. For LSC
platforms the binding table should already be empty (except render
* targets), so it's already compatible.
*/
static void
build_device_bindable_binding_table(struct apply_pipeline_layout_state *state,
nir_shader *shader,
struct anv_pipeline_bind_map *map,
struct anv_pipeline_push_map *push_map,
void *push_map_mem_ctx)
{
/* Compute the amount of push block items required. */
unsigned push_block_count = map->surface_count + MAX_SETS;
for (unsigned s = 0; s < state->set_count; s++) {
const struct anv_descriptor_set_layout *set_layout =
state->set_layouts[s];
if (!set_layout)
continue;
for (unsigned b = 0; b < set_layout->binding_count; b++) {
/* Inline uniform blocks do not contribute per-array-element push
 * blocks.
 */
if (set_layout->binding[b].type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
push_block_count += set_layout->binding[b].array_size;
}
}
/* Assign a block index for each surface */
push_map->block_to_descriptor = rzalloc_array(push_map_mem_ctx,
struct anv_pipeline_binding,
push_block_count);
/* Seed the push map with the existing surface entries so block indices
 * stay aligned with the binding table.
 */
memcpy(push_map->block_to_descriptor,
map->surface_to_descriptor,
sizeof(push_map->block_to_descriptor[0]) * map->surface_count);
push_map->block_count = map->surface_count;
/* No BTI allowed for descriptor sets, we'll use A64 messages. */
for (unsigned s = 0; s < state->set_count; s++) {
state->set[s].desc_offset = BINDLESS_OFFSET;
if (brw_shader_stage_requires_bindless_resources(shader->info.stage)) {
/* Stages that require fully bindless resources cannot push the set
 * address; mark the block invalid.
 */
state->set[s].push_block = UINT32_MAX;
} else {
state->set[s].push_block = push_map->block_count;
add_descriptor_push_entry(push_map, s, state->bind_map);
}
}
for (uint32_t set = 0; set < state->set_count; set++) {
struct anv_descriptor_set_layout *set_layout =
state->set_layouts[set];
if (!set_layout)
continue;
for (unsigned b = 0; b < set_layout->binding_count; b++) {
const struct anv_descriptor_set_binding_layout *bind_layout =
&set_layout->binding[b];
/* Assume bindless by default */
state->set[set].binding[b].surface_offset = BINDLESS_OFFSET;
state->set[set].binding[b].sampler_offset = BINDLESS_OFFSET;
if (binding_is_promotable_to_push(set_layout, bind_layout)) {
if (bind_layout->type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
state->set[set].binding[b].push_block = push_map->block_count;
for (unsigned i = 0; i < bind_layout->array_size; i++)
add_push_entry(push_map, set, b, i, bind_layout);
}
}
/* Pre-LSC, push-descriptor descriptor-buffer sets still get binding
 * table entries.  NOTE(review): this uses the non-KHR
 * VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT spelling while
 * build_descriptor_set_base_address uses the _KHR alias — confirm
 * both are the same value.
 */
if (!state->pdevice->info.has_lsc &&
(set_layout->vk.flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT) &&
(set_layout->vk.flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT)) {
/* Embedded samplers have an array size limited to 1 */
if (state->set[set].binding[b].properties & BINDING_PROPERTY_EMBEDDED_SAMPLER)
add_embedded_sampler_entry(state, map, set, b);
const uint8_t max_planes = bti_multiplier(state, set, b);
for (unsigned i = 0; i < bind_layout->array_size; i++) {
if (bind_layout->data & ANV_DESCRIPTOR_BTI_SURFACE_STATE) {
/* YCbCr conversions may need one entry per plane; pad unused
 * plane slots with null entries to keep the stride constant.
 */
const uint8_t max_sampler_planes =
(bind_layout->samplers &&
bind_layout->samplers[i].has_ycbcr_conversion) ?
vk_format_get_plane_count(
bind_layout->samplers[i].ycbcr_conversion_state.format) :
1;
for (uint8_t p = 0; p < max_planes; p++) {
if (p < max_sampler_planes) {
add_bti_entry(map, set, b, i, p, bind_layout);
} else {
add_null_bti_entry(map);
}
}
}
/* Embedded samplers were already handled above. */
if (!(state->set[set].binding[b].properties & BINDING_PROPERTY_EMBEDDED_SAMPLER) &&
(bind_layout->data & ANV_DESCRIPTOR_BTI_SAMPLER_STATE)) {
for (unsigned i = 0; i < bind_layout->array_size; i++) {
for (uint8_t p = 0; p < max_planes; p++)
add_sampler_entry(map, set, b, i, p, bind_layout);
}
}
}
}
}
}
}
static void
build_packed_binding_table(struct apply_pipeline_layout_state *state,
nir_shader *shader,
@ -2429,6 +2570,22 @@ build_descriptor_sets_bti_array(nir_builder *b,
return set_to_bti;
}
/* Create a function-local array variable mapping descriptor set index to
 * the high 32 bits of that set's heap base address (see
 * build_descriptor_set_base_address), so later lowering can look up the
 * base for dynamically indexed sets with nir_load_array_var.
 */
static nir_variable *
build_descriptor_sets_base_addr_array(nir_builder *b,
const struct apply_pipeline_layout_state *state)
{
nir_variable *set_to_base_addr = nir_local_variable_create(
b->impl, glsl_array_type(glsl_uint_type(), MAX_SETS, 0),
"set_to_base_addr");
for (uint32_t i = 0; i < MAX_SETS; i++) {
nir_store_array_var(b, set_to_base_addr, nir_imm_int(b, i),
build_descriptor_set_base_address(b, i, state), 0x1);
}
return set_to_base_addr;
}
static nir_variable *
build_descriptor_sets_offset_array(nir_builder *b,
const struct apply_pipeline_layout_state *state)
@ -2452,6 +2609,7 @@ anv_nir_apply_pipeline_layout(nir_shader *shader,
struct anv_descriptor_set_layout * const *set_layouts,
uint32_t set_count,
const uint32_t *dynamic_offset_start,
bool device_bindable,
struct anv_pipeline_bind_map *map,
struct anv_pipeline_push_map *push_map,
void *push_map_mem_ctx)
@ -2473,6 +2631,7 @@ anv_nir_apply_pipeline_layout(nir_shader *shader,
.dynamic_offset_start = dynamic_offset_start,
.ssbo_addr_format = anv_nir_ssbo_addr_format(pdevice, robust_flags),
.ubo_addr_format = anv_nir_ubo_addr_format(pdevice, robust_flags),
.is_device_bindable = device_bindable,
};
state.lowered_instrs = _mesa_pointer_set_create(state.mem_ctx);
@ -2491,8 +2650,10 @@ anv_nir_apply_pipeline_layout(nir_shader *shader,
progress |= nir_shader_instructions_pass(shader, get_used_bindings,
nir_metadata_all, &state);
/* Build the binding table */
build_packed_binding_table(&state, shader, map, push_map, push_map_mem_ctx);
if (device_bindable)
build_device_bindable_binding_table(&state, shader, map, push_map, push_map_mem_ctx);
else
build_packed_binding_table(&state, shader, map, push_map, push_map_mem_ctx);
/* Before we do the normal lowering, we look for any SSBO operations
* that we can lower to the BTI model and lower them up-front. The BTI
@ -2527,6 +2688,7 @@ anv_nir_apply_pipeline_layout(nir_shader *shader,
nir_foreach_function_impl(impl, shader) {
nir_builder _b = nir_builder_at(nir_before_impl(impl)), *b = &_b;
state.set_idx_to_bti = build_descriptor_sets_bti_array(b, &state);
state.set_idx_to_base_addr = build_descriptor_sets_base_addr_array(b, &state);
state.set_idx_to_offset = build_descriptor_sets_offset_array(b, &state);
progress |= nir_function_instructions_pass(impl,
lower_direct_buffer_instr,

View file

@ -564,6 +564,10 @@ anv_shader_set_relocs(struct anv_device *device,
.id = BRW_SHADER_RELOC_DESCRIPTORS_BUFFER_ADDR_HIGH,
.value = device->physical->va.dynamic_visible_pool.addr >> 32,
};
reloc_values[rv_count++] = (struct intel_shader_reloc_value) {
.id = BRW_SHADER_RELOC_PUSH_DESCRIPTORS_BUFFER_ADDR_HIGH,
.value = device->physical->va.internal_surface_state_pool.addr >> 32,
};
assert((device->physical->va.indirect_descriptor_pool.addr & 0xffffffff) == 0);
assert((device->physical->va.internal_surface_state_pool.addr & 0xffffffff) == 0);
reloc_values[rv_count++] = (struct intel_shader_reloc_value) {

View file

@ -1613,8 +1613,9 @@ anv_shader_lower_nir(struct anv_device *device,
pdevice, shader_data->key.base.robust_flags,
set_layouts, set_layout_count,
(shader_data->info->flags &
VK_SHADER_CREATE_INDEPENDENT_SETS_BIT_MESA) ? NULL:
VK_SHADER_CREATE_INDEPENDENT_SETS_BIT_MESA) ? NULL :
dynamic_descriptors_offsets,
shader_data->info->flags & VK_SHADER_CREATE_INDIRECT_BINDABLE_BIT_EXT,
&shader_data->bind_map, &shader_data->push_map, mem_ctx);
}