From bba428ce3f325ae906a326e8531c5dadc963b355 Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Thu, 22 Jan 2026 12:10:40 +0200 Subject: [PATCH] anv: promote push constant pointers to push buffers Signed-off-by: Lionel Landwerlin Reviewed-by: Felix DeGrood Part-of: --- src/intel/vulkan/anv_nir.h | 4 + .../vulkan/anv_nir_compute_push_layout.c | 183 +++++++++++++----- .../vulkan/anv_nir_push_constants_analysis.c | 127 +++++++++++- src/intel/vulkan/anv_private.h | 1 + src/intel/vulkan/anv_shader.c | 6 + src/intel/vulkan/genX_cmd_draw.c | 7 + 6 files changed, 270 insertions(+), 58 deletions(-) diff --git a/src/intel/vulkan/anv_nir.h b/src/intel/vulkan/anv_nir.h index 29ab146259d..f7664e5afb4 100644 --- a/src/intel/vulkan/anv_nir.h +++ b/src/intel/vulkan/anv_nir.h @@ -198,6 +198,10 @@ anv_nir_get_ubo_binding_push_block(nir_src src) return nir_intrinsic_resource_block_intel(intrin); } +bool anv_nir_is_pushable_pointer(nir_intrinsic_instr *intrin, + uint32_t *out_push_offset, + uint32_t *out_load_offset); + void anv_nir_analyze_push_constants_ranges(nir_shader *nir, const struct intel_device_info *devinfo, const struct anv_pipeline_push_map *push_map, diff --git a/src/intel/vulkan/anv_nir_compute_push_layout.c b/src/intel/vulkan/anv_nir_compute_push_layout.c index 7a560558c55..64557379373 100644 --- a/src/intel/vulkan/anv_nir_compute_push_layout.c +++ b/src/intel/vulkan/anv_nir_compute_push_layout.c @@ -31,6 +31,7 @@ struct push_data { bool push_ubo_ranges; + bool push_pointer_ranges; bool needs_wa_18019110168; bool needs_dyn_tess_config; BITSET_DECLARE(push_dwords, PUSH_CONSTANTS_DWORDS); @@ -100,9 +101,11 @@ gather_push_data(nir_shader *nir, const struct anv_nir_push_layout_info *push_info, struct brw_base_prog_key *prog_key, struct anv_pipeline_bind_map *map, - struct set *lowered_ubo_instrs) + struct set *lowered_ubo_instrs, + struct set *lowered_pointer_instrs) { bool has_const_ubo = false; + bool has_const_ptr_ubo = false; struct push_data data = { 0, }; 
BITSET_ZERO(data.push_dwords); @@ -135,6 +138,11 @@ gather_push_data(nir_shader *nir, has_const_ubo = true; break; } + if (lowered_pointer_instrs && + _mesa_set_search(lowered_pointer_instrs, intrin)) { + has_const_ptr_ubo = true; + break; + } unsigned base = nir_intrinsic_base(intrin); unsigned range = nir_intrinsic_range(intrin); @@ -143,6 +151,13 @@ gather_push_data(nir_shader *nir, break; } + case nir_intrinsic_load_global_constant: { + uint32_t push_offset, load_offset; + if (anv_nir_is_pushable_pointer(intrin, &push_offset, &load_offset)) + has_const_ptr_ubo = true; + break; + } + default: break; } @@ -150,6 +165,10 @@ gather_push_data(nir_shader *nir, } } + data.push_pointer_ranges = + has_const_ptr_ubo && nir->info.stage != MESA_SHADER_COMPUTE && + !brw_shader_stage_requires_bindless_resources(nir->info.stage); + data.push_ubo_ranges = has_const_ubo && nir->info.stage != MESA_SHADER_COMPUTE && !brw_shader_stage_requires_bindless_resources(nir->info.stage); @@ -171,9 +190,10 @@ struct lower_to_push_data_intel_state { const struct anv_pipeline_push_map *push_map; struct set *lowered_ubo_instrs; + struct set *lowered_pointer_instrs; /* Amount that should be subtracted to UBOs loads converted to - * push_data_intel (in lowered_ubo_instrs) + * push_data_intel (in lowered_ubo_instrs or lowered_pointer_instrs) */ unsigned reduced_push_ranges; }; @@ -203,62 +223,115 @@ lower_internal_ubo(nir_builder *b, } static bool -lower_ubo_to_push_data_intel(nir_builder *b, - nir_intrinsic_instr *intrin, - void *_data) +lower_ubo_or_pointer_to_push_data_intel(nir_builder *b, + nir_intrinsic_instr *intrin, + void *_data) { - if (intrin->intrinsic != nir_intrinsic_load_ubo) - return false; + switch (intrin->intrinsic) { + case nir_intrinsic_load_ubo: { + if (!anv_nir_is_promotable_ubo_binding(intrin->src[0]) || + !nir_src_is_const(intrin->src[1]) || + brw_shader_stage_requires_bindless_resources(b->shader->info.stage)) + return lower_internal_ubo(b, intrin); - if
(!anv_nir_is_promotable_ubo_binding(intrin->src[0]) || - !nir_src_is_const(intrin->src[1]) || - brw_shader_stage_requires_bindless_resources(b->shader->info.stage)) - return lower_internal_ubo(b, intrin); + const struct lower_to_push_data_intel_state *state = _data; + const int block = anv_nir_get_ubo_binding_push_block(intrin->src[0]); + assert(block < state->push_map->block_count); + const struct anv_pipeline_binding *binding = + &state->push_map->block_to_descriptor[block]; + const unsigned byte_offset = nir_src_as_uint(intrin->src[1]); - const struct lower_to_push_data_intel_state *state = _data; - const int block = anv_nir_get_ubo_binding_push_block(intrin->src[0]); - assert(block < state->push_map->block_count); - const struct anv_pipeline_binding *binding = - &state->push_map->block_to_descriptor[block]; - const unsigned byte_offset = nir_src_as_uint(intrin->src[1]); - const unsigned num_components = - nir_def_last_component_read(&intrin->def) + 1; - const int bytes = num_components * (intrin->def.bit_size / 8); + const unsigned num_components = + nir_def_last_component_read(&intrin->def) + 1; + const int bytes = num_components * (intrin->def.bit_size / 8); - uint32_t range_offset = 0; - const struct anv_push_range *push_range = NULL; - for (uint32_t i = 0; i < 4; i++) { - if (state->bind_map->push_ranges[i].set == binding->set && - state->bind_map->push_ranges[i].index == binding->index && - byte_offset >= state->bind_map->push_ranges[i].start * 32 && - (byte_offset + bytes) <= (state->bind_map->push_ranges[i].start + - state->bind_map->push_ranges[i].length) * 32) { - push_range = &state->bind_map->push_ranges[i]; - break; - } else { - range_offset += state->bind_map->push_ranges[i].length * 32; + uint32_t range_offset = 0; + const struct anv_push_range *push_range = NULL; + for (uint32_t i = 0; i < 4; i++) { + if (state->bind_map->push_ranges[i].set == binding->set && + state->bind_map->push_ranges[i].index == binding->index && + byte_offset >= 
state->bind_map->push_ranges[i].start * 32 && + (byte_offset + bytes) <= (state->bind_map->push_ranges[i].start + + state->bind_map->push_ranges[i].length) * 32) { + push_range = &state->bind_map->push_ranges[i]; + break; + } else { + range_offset += state->bind_map->push_ranges[i].length * 32; + } } + + if (push_range == NULL) + return lower_internal_ubo(b, intrin); + + b->cursor = nir_before_instr(&intrin->instr); + nir_def *data = nir_load_push_data_intel( + b, + nir_def_last_component_read(&intrin->def) + 1, + intrin->def.bit_size, + nir_imm_int(b, 0), + .base = range_offset + byte_offset - push_range->start * 32, + .range = nir_intrinsic_range(intrin)); + nir_def_replace(&intrin->def, data); + + _mesa_set_add(state->lowered_ubo_instrs, + nir_def_as_intrinsic(data)); + + return true; } - if (push_range == NULL) - return lower_internal_ubo(b, intrin); + case nir_intrinsic_load_global_constant: { + uint32_t push_byte_offset, load_byte_offset; + if (!anv_nir_is_pushable_pointer(intrin, + &push_byte_offset, + &load_byte_offset)) + return false; - assert(!brw_shader_stage_is_bindless(b->shader->info.stage)); - assert(!brw_shader_stage_has_inline_data(state->devinfo, b->shader->info.stage)); + b->cursor = nir_before_instr(&intrin->instr); - b->cursor = nir_before_instr(&intrin->instr); - nir_def *data = nir_load_push_data_intel( - b, - nir_def_last_component_read(&intrin->def) + 1, - intrin->def.bit_size, - nir_imm_int(b, 0), - .base = range_offset + byte_offset - push_range->start * 32, - .range = nir_intrinsic_range(intrin)); - nir_def_replace(&intrin->def, data); + const unsigned num_components = + nir_def_last_component_read(&intrin->def) + 1; + const int bytes = num_components * (intrin->def.bit_size / 8); - _mesa_set_add(state->lowered_ubo_instrs, nir_def_as_intrinsic(data)); + const struct lower_to_push_data_intel_state *state = _data; + uint32_t range_offset = 0; + const struct anv_push_range *push_range = NULL; + for (uint32_t i = 0; i < 4; i++) { + if 
(state->bind_map->push_ranges[i].set == ANV_DESCRIPTOR_SET_PUSH_POINTER && + state->bind_map->push_ranges[i].index == push_byte_offset && + load_byte_offset >= state->bind_map->push_ranges[i].start * 32 && + (load_byte_offset + bytes) <= (state->bind_map->push_ranges[i].start + + state->bind_map->push_ranges[i].length) * 32) { + push_range = &state->bind_map->push_ranges[i]; + break; + } else { + range_offset += state->bind_map->push_ranges[i].length * 32; + } + } - return true; + if (push_range == NULL) + return false; + + assert(!brw_shader_stage_is_bindless(b->shader->info.stage)); + assert(!brw_shader_stage_has_inline_data(state->devinfo, b->shader->info.stage)); + + b->cursor = nir_before_instr(&intrin->instr); + nir_def *data = nir_load_push_data_intel( + b, + nir_def_last_component_read(&intrin->def) + 1, + intrin->def.bit_size, + nir_imm_int(b, 0), + .base = range_offset + load_byte_offset - push_range->start * 32, + .range = bytes); + nir_def_replace(&intrin->def, data); + _mesa_set_add(state->lowered_pointer_instrs, + nir_def_as_intrinsic(data)); + + return true; + } + + default: + return false; + } } static nir_def * @@ -355,7 +428,8 @@ lower_to_push_data_intel(nir_builder *b, b->cursor = nir_before_instr(&intrin->instr); const unsigned base = nir_intrinsic_base(intrin); - if (_mesa_set_search(state->lowered_ubo_instrs, intrin)) { + if (_mesa_set_search(state->lowered_ubo_instrs, intrin) || + _mesa_set_search(state->lowered_pointer_instrs, intrin)) { /* For lowered UBOs to push constants, shrink the base by the amount * we shrinked the driver push constants. 
*/ @@ -535,7 +609,8 @@ anv_nir_compute_push_layout(nir_shader *nir, memset(map->push_ranges, 0, sizeof(map->push_ranges)); struct push_data data = - gather_push_data(nir, robust_flags, devinfo, push_info, prog_key, map, NULL); + gather_push_data(nir, robust_flags, devinfo, push_info, + prog_key, map, NULL, NULL); struct anv_push_range push_constant_range = compute_final_push_range(nir, devinfo, &data, map); @@ -578,7 +653,7 @@ anv_nir_compute_push_layout(nir_shader *nir, } struct anv_push_range analysis_ranges[4] = {}; - if (data.push_ubo_ranges) { + if (data.push_ubo_ranges || data.push_pointer_ranges) { anv_nir_analyze_push_constants_ranges(nir, devinfo, push_map, analysis_ranges); } @@ -640,16 +715,19 @@ anv_nir_compute_push_layout(nir_shader *nir, .bind_map = map, .push_map = push_map, .lowered_ubo_instrs = _mesa_pointer_set_create(NULL), + .lowered_pointer_instrs = _mesa_pointer_set_create(NULL), }; bool progress = nir_shader_intrinsics_pass( - nir, lower_ubo_to_push_data_intel, + nir, lower_ubo_or_pointer_to_push_data_intel, nir_metadata_control_flow, &lower_state); if (progress && nir_opt_dce(nir)) { /* Regather the push data */ data = gather_push_data(nir, robust_flags, devinfo, push_info, prog_key, - map, lower_state.lowered_ubo_instrs); + map, + lower_state.lowered_ubo_instrs, + lower_state.lowered_pointer_instrs); /* Update the ranges */ struct anv_push_range shrinked_push_constant_range = @@ -674,6 +752,7 @@ anv_nir_compute_push_layout(nir_shader *nir, nir_metadata_control_flow, &lower_state); ralloc_free(lower_state.lowered_ubo_instrs); + ralloc_free(lower_state.lowered_pointer_instrs); /* Do this before calling brw_cs_fill_push_const_info(), it uses the data * in prog_data->push_sizes[]. 
diff --git a/src/intel/vulkan/anv_nir_push_constants_analysis.c b/src/intel/vulkan/anv_nir_push_constants_analysis.c index ac49f65dd2c..e2fefb80858 100644 --- a/src/intel/vulkan/anv_nir_push_constants_analysis.c +++ b/src/intel/vulkan/anv_nir_push_constants_analysis.c @@ -24,6 +24,7 @@ set_score(uint8_t set) */ switch (set) { case ANV_DESCRIPTOR_SET_DESCRIPTORS: return 3; + case ANV_DESCRIPTOR_SET_PUSH_POINTER: return 3; default: UNREACHABLE("unexpected push set"); } } @@ -69,6 +70,7 @@ cmp_push_range_entry(const void *va, const void *vb) enum push_block_type { PUSH_BLOCK_TYPE_UBO = 1, + PUSH_BLOCK_TYPE_POINTER = 2, }; struct push_block_key @@ -158,6 +160,107 @@ maybe_add_pushable_ubo(struct push_analysis_state *state, info->uses[offset]++; } +/* Chase a pattern like this : + * + * con 32x2 %2 = @load_push_constant (%1 (0x20)) (base=0, range=64, align_mul=256, align_offset=32) + * con 64 %3 = pack_64_2x32_split %2.x, %2.y + * con 64 %4 = load_const (0x000000000000000c = 12) + * con 64 %5 = iadd %4 (0xc), %3 + * con 32 %6 = @load_global_constant (%5) (access=readonly|reorderable, align_mul=4, align_offset=0) + */ +bool +anv_nir_is_pushable_pointer(nir_intrinsic_instr *intrin, + uint32_t *out_push_offset, + uint32_t *out_load_offset) +{ + assert(intrin->intrinsic == nir_intrinsic_load_global_constant); + + if (!(nir_intrinsic_access(intrin) & ACCESS_NON_WRITEABLE)) + return false; + + if (nir_intrinsic_align_mul(intrin) < 32) + return false; + + nir_scalar val = { intrin->src[0].ssa, 0 }; + + /* Extract constant offset if any */ + *out_load_offset = 0; + nir_alu_instr *alu; + if (nir_scalar_is_alu(val) && + (alu = nir_def_as_alu(val.def))->op == nir_op_iadd) { + for (unsigned i = 0; i < 2; ++i) { + nir_scalar add_src = { alu->src[i].src.ssa, alu->src[i].swizzle[val.comp] }; + if (nir_scalar_is_const(add_src)) { + *out_load_offset = nir_scalar_as_uint(add_src); + } else if (val.def == intrin->src[0].ssa) { + /* This is the non constant part of the iadd, if the other 
source + * is constant, we'll gather the value in the previous if block, + * otherwise we'll give up on this in the next else block. + */ + val = add_src; + } else { + return false; + } + } + } + + /* Unwrap packing + * + * TODO: consider swizzle + */ + if (nir_scalar_is_alu(val)) { + nir_alu_instr *pack_alu = nir_def_as_alu(val.def); + if (pack_alu->op != nir_op_pack_64_2x32_split) + return false; + + val = (nir_scalar){ pack_alu->src[0].src.ssa, pack_alu->src[0].swizzle[0] }; + } + + if (!nir_scalar_is_intrinsic(val)) + return false; + + nir_intrinsic_instr *push_intrin = nir_def_as_intrinsic(val.def); + if (push_intrin->intrinsic != nir_intrinsic_load_push_constant) + return false; + + if (!nir_src_is_const(push_intrin->src[0])) + return false; + + *out_push_offset = nir_intrinsic_base(push_intrin) + + nir_src_as_uint(push_intrin->src[0]); + return true; +} + +static void +add_pushable_pointer(struct push_analysis_state *state, + nir_intrinsic_instr *intrin, + uint32_t push_byte_offset, + uint32_t load_byte_offset) +{ + const int offset = load_byte_offset / state->devinfo->grf_size; + + /* Avoid shifting by larger than the width of our bitfield, as this + * is undefined in C. Even if we require multiple bits to represent + * the entire value, it's OK to record a partial value - the backend + * is capable of falling back to pull loads for later components of + * vectors, as it has to shrink ranges for other reasons anyway. 
+ */ + if (offset >= 64) + return; + + const unsigned num_components = + nir_def_last_component_read(&intrin->def) + 1; + const int bytes = num_components * (intrin->def.bit_size / 8); + const int start = ROUND_DOWN_TO(load_byte_offset, state->devinfo->grf_size); + const int end = align(load_byte_offset + bytes, state->devinfo->grf_size); + const int chunks = (end - start) / state->devinfo->grf_size; + + struct push_block_info *info = + get_block_info(state, PUSH_BLOCK_TYPE_POINTER, push_byte_offset); + info->offsets |= ((1ull << chunks) - 1) << offset; + info->uses[offset]++; +} + static void analyze_pushable_block(struct push_analysis_state *state, nir_block *block) { @@ -173,6 +276,13 @@ analyze_pushable_block(struct push_analysis_state *state, nir_block *block) maybe_add_pushable_ubo(state, intrin); break; + case nir_intrinsic_load_global_constant: { + uint32_t push_offset, load_offset; + if (anv_nir_is_pushable_pointer(intrin, &push_offset, &load_offset)) + add_pushable_pointer(state, intrin, push_offset, load_offset); + break; + } + default: break; } @@ -259,12 +369,17 @@ anv_nir_analyze_push_constants_ranges(nir_shader *nir, struct push_range_entry *entry = util_dynarray_grow(&ranges, struct push_range_entry, 1); - assert(info->key.index < push_map->block_count); - const struct anv_pipeline_binding *binding = - &push_map->block_to_descriptor[info->key.index]; - entry->range.set = binding->set; - entry->range.index = binding->index; - entry->range.dynamic_offset_index = binding->dynamic_offset_index; + if (info->key.type == PUSH_BLOCK_TYPE_UBO) { + assert(info->key.index < push_map->block_count); + const struct anv_pipeline_binding *binding = + &push_map->block_to_descriptor[info->key.index]; + entry->range.set = binding->set; + entry->range.index = binding->index; + entry->range.dynamic_offset_index = binding->dynamic_offset_index; + } else { + entry->range.set = ANV_DESCRIPTOR_SET_PUSH_POINTER; + entry->range.index = info->key.index; + } entry->range.start 
= first_bit; /* first_hole is one beyond the end, so we don't need to add 1 */ entry->range.length = first_hole - first_bit; diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index bbc8003cda9..f9e812ab8c5 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -1039,6 +1039,7 @@ VkResult anv_reloc_list_append(struct anv_reloc_list *list, /* Shaders */ +#define ANV_DESCRIPTOR_SET_PUSH_POINTER (UINT8_MAX - 5) #define ANV_DESCRIPTOR_SET_PER_PRIM_PADDING (UINT8_MAX - 4) #define ANV_DESCRIPTOR_SET_NULL (UINT8_MAX - 3) #define ANV_DESCRIPTOR_SET_PUSH_CONSTANTS (UINT8_MAX - 2) diff --git a/src/intel/vulkan/anv_shader.c b/src/intel/vulkan/anv_shader.c index 811a9716551..a0c25c33f49 100644 --- a/src/intel/vulkan/anv_shader.c +++ b/src/intel/vulkan/anv_shader.c @@ -314,6 +314,12 @@ get_shader_bind_map_text(const struct anv_device *device, fprintf(stream, "Per primitive alignment (gfx libs & mesh)"); break; + case ANV_DESCRIPTOR_SET_PUSH_POINTER: + fprintf(stream, "pushed pointer (push_constant_offset=%dB start=%dB)", + bind_map->push_ranges[i].index, + bind_map->push_ranges[i].start * 32); + break; + default: fprintf(stream, "UBO (set=%d binding=%d start=%dB)", bind_map->push_ranges[i].set, diff --git a/src/intel/vulkan/genX_cmd_draw.c b/src/intel/vulkan/genX_cmd_draw.c index c3e02d25e74..24de1fa6170 100644 --- a/src/intel/vulkan/genX_cmd_draw.c +++ b/src/intel/vulkan/genX_cmd_draw.c @@ -204,6 +204,12 @@ get_push_range_address(struct anv_cmd_buffer *cmd_buffer, case ANV_DESCRIPTOR_SET_PER_PRIM_PADDING: return cmd_buffer->device->workaround_address; + case ANV_DESCRIPTOR_SET_PUSH_POINTER: { + uint64_t address = *((uint64_t *)&gfx_state->base.push_constants.client_data[range->index]); + assert(address % ANV_UBO_ALIGNMENT == 0); + return anv_address_from_u64(address); + } + default: { assert(range->set < MAX_SETS); struct anv_descriptor_set *set = @@ -274,6 +280,7 @@ get_push_range_bound_size(struct anv_cmd_buffer 
*cmd_buffer, case ANV_DESCRIPTOR_SET_NULL: case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: case ANV_DESCRIPTOR_SET_PER_PRIM_PADDING: + case ANV_DESCRIPTOR_SET_PUSH_POINTER: return (range->start + range->length) * 32; default: {