/* Copyright © 2026 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "anv_nir.h"

#include "util/u_dynarray.h"

/**
 * A candidate push range together with its benefit, i.e. the sum of the
 * recorded load counts of the chunks it covers.
 */
struct push_range_entry {
   struct anv_push_range range;
   int benefit;
};

/**
 * Returns the priority of the kind of data a push range refers to. Used as
 * the first tie-breaker when two ranges have the same score.
 */
static int
set_score(uint8_t set)
{
   /* UBO bindings */
   if (set < MAX_SETS)
      return 1;

   /* Promotion of descriptor data, higher score than UBOs because of inline
    * uniforms or data from the descriptor that can be used for later resource
    * access.
    */
   switch (set) {
   case ANV_DESCRIPTOR_SET_DESCRIPTORS:
      return 3;
   default:
      UNREACHABLE("unexpected push set");
   }
}

/**
 * Score of a range: every recorded use counts double, every chunk of length
 * costs one.
 */
static int
score(const struct push_range_entry *entry)
{
   return 2 * entry->benefit - entry->range.length;
}

/**
 * Compares score for two UBO range entries.
 *
 * For a descending qsort().
 */
static int
cmp_push_range_entry(const void *va, const void *vb)
{
   const struct push_range_entry *a = va;
   const struct push_range_entry *b = vb;

   /* Rank based on scores, descending order */
   int delta = score(b) - score(a);

   /* Then use promotion type, descending order */
   if (delta == 0)
      delta = set_score(b->range.set) - set_score(a->range.set);

   /* Then use the set index as a tie-breaker, descending order */
   if (delta == 0)
      delta = b->range.set - a->range.set;

   /* Then use the UBO block index as a tie-breaker, descending order */
   if (delta == 0)
      delta = b->range.index - a->range.index;

   /* Finally use the start offset as a second tie-breaker, ascending order */
   if (delta == 0)
      delta = a->range.start - b->range.start;

   return delta;
}

/** Kind of block tracked by the analysis. Only UBOs are tracked here. */
enum push_block_type {
   PUSH_BLOCK_TYPE_UBO = 1,
};

/**
 * Hash table key identifying a block. It is hashed and compared as raw
 * bytes over sizeof(struct push_block_key).
 */
struct push_block_key {
   enum push_block_type type;
   uint32_t index;
};

/** Per-block usage information gathered while walking the shader. */
struct push_block_info {
   struct push_block_key key;

   /* Each bit in the offsets bitfield represents one register-sized
    * (devinfo->grf_size bytes) section of data. If it's set to one, there is
    * interesting UBO data at that offset. If not, there's a "hole" - padding
    * between data - or just nothing at all.
    */
   uint64_t offsets;

   /* Number of loads recorded at each register-sized offset (indexed by the
    * first chunk of the load). Saturates at UINT8_MAX.
    */
   uint8_t uses[64];
};

/** State shared by the analysis helpers. */
struct push_analysis_state {
   const struct intel_device_info *devinfo;

   /* Maps struct push_block_key -> struct push_block_info */
   struct hash_table *blocks;
};

/** Hash callback for struct push_block_key. */
static uint32_t
push_block_key_hash(const void *key)
{
   return _mesa_hash_data(key, sizeof(struct push_block_key));
}

/** Equality callback for struct push_block_key. */
static bool
push_block_key_compare(const void *key1, const void *key2)
{
   return memcmp(key1, key2, sizeof(struct push_block_key)) == 0;
}

/**
 * Looks up the info record for (type, index), creating a zero-initialized
 * one owned by the hash table's ralloc context if none exists yet.
 */
static struct push_block_info *
get_block_info(struct push_analysis_state *state,
               enum push_block_type type, uint32_t index)
{
   struct push_block_key key = {
      .type = type,
      .index = index,
   };

   struct hash_entry *entry = _mesa_hash_table_search(state->blocks, &key);
   if (entry)
      return (struct push_block_info *) entry->data;

   struct push_block_info *info =
      rzalloc(state->blocks, struct push_block_info);
   info->key = key;

   /* The key stored in the table lives inside the info record itself. */
   _mesa_hash_table_insert(state->blocks, &info->key, info);

   return info;
}

/**
 * Records a load_ubo intrinsic in the per-block usage information.
 *
 * The caller has already checked that the offset source (src[1]) is
 * constant and that the binding (src[0]) is promotable.
 */
static void
maybe_add_pushable_ubo(struct push_analysis_state *state,
                       nir_intrinsic_instr *intrin)
{
   const int block = anv_nir_get_ubo_binding_push_block(intrin->src[0]);
   const unsigned byte_offset = nir_src_as_uint(intrin->src[1]);
   const int offset = byte_offset / state->devinfo->grf_size;

   /* Avoid shifting by larger than the width of our bitfield, as this
    * is undefined in C. Even if we require multiple bits to represent
    * the entire value, it's OK to record a partial value - the backend
    * is capable of falling back to pull loads for later components of
    * vectors, as it has to shrink ranges for other reasons anyway.
    */
   if (offset >= 64)
      return;

   /* The value might span multiple GRFs. */
   const unsigned num_components =
      nir_def_last_component_read(&intrin->def) + 1;
   const int bytes = num_components * (intrin->def.bit_size / 8);
   const int start = ROUND_DOWN_TO(byte_offset, state->devinfo->grf_size);
   const int end = align(byte_offset + bytes, state->devinfo->grf_size);
   const int chunks = (end - start) / state->devinfo->grf_size;

   /* TODO: should we count uses in loops as higher benefit? */

   struct push_block_info *info =
      get_block_info(state, PUSH_BLOCK_TYPE_UBO, block);

   /* Bits shifted past bit 63 are simply dropped (see comment above). */
   info->offsets |= ((1ull << chunks) - 1) << offset;

   /* The counter is only 8 bits wide: saturate rather than wrap around, so
    * that a heavily used offset is never mistaken for an unused one.
    */
   if (info->uses[offset] < UINT8_MAX)
      info->uses[offset]++;
}

/**
 * Scans one NIR block and records every load_ubo with a promotable binding
 * and a constant offset.
 */
static void
analyze_pushable_block(struct push_analysis_state *state, nir_block *block)
{
   nir_foreach_instr(instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_ubo:
         if (anv_nir_is_promotable_ubo_binding(intrin->src[0]) &&
             nir_src_is_const(intrin->src[1]))
            maybe_add_pushable_ubo(state, intrin);
         break;

      default:
         break;
      }
   }
}

/**
 * Debug helper: prints one range entry. info may be NULL, in which case the
 * offsets bitfield is printed as 0. state is currently unused.
 */
static void
print_push_entry(FILE *file,
                 const struct push_block_info *info,
                 const struct push_range_entry *entry,
                 struct push_analysis_state *state)
{
   fprintf(file,
           "set %2d, index %2d, start %2d, length %2d, bits = %"PRIx64", "
           "benefit %2d, cost %2d, score = %2d\n",
           entry->range.set, entry->range.index,
           entry->range.start, entry->range.length,
           info ? info->offsets : 0ul,
           entry->benefit, entry->range.length, score(entry));
}

/**
 * Analyzes the constant-offset UBO loads of a shader and selects up to four
 * ranges worth pushing.
 *
 * \param nir         shader to analyze (only read by the visible code)
 * \param devinfo     device info; grf_size sets the analysis granularity
 * \param push_map    maps push block indices to descriptor bindings
 * \param out_ranges  receives the selected ranges, most beneficial first;
 *                    unused slots are zero-initialized
 *
 * The start and length of the returned ranges are in 32-byte units and the
 * sum of the lengths does not exceed 64.
 */
void
anv_nir_analyze_push_constants_ranges(nir_shader *nir,
                                      const struct intel_device_info *devinfo,
                                      const struct anv_pipeline_push_map *push_map,
                                      struct anv_push_range out_ranges[4])
{
   void *mem_ctx = ralloc_context(NULL);

   struct push_analysis_state state = {
      .devinfo = devinfo,
      .blocks = _mesa_hash_table_create(mem_ctx,
                                        push_block_key_hash,
                                        push_block_key_compare),
   };

   /* Walk the IR, recording how many times each UBO block/offset is used. */
   nir_foreach_function_impl(impl, nir) {
      nir_foreach_block(block, impl) {
         analyze_pushable_block(&state, block);
      }
   }

   /* Find ranges: a block, starting register-size aligned byte offset, and
    * length.
    */
   struct util_dynarray ranges;
   util_dynarray_init(&ranges, mem_ctx);

   hash_table_foreach(state.blocks, block_entry) {
      const struct push_block_info *info = block_entry->data;
      uint64_t offsets = info->offsets;

      /* Walk through the offsets bitfield, finding contiguous regions of
       * set bits:
       *
       *   0000000001111111111111000000000000111111111111110000000011111100
       *            ^^^^^^^^^^^^^            ^^^^^^^^^^^^^^        ^^^^^^
       *
       * Each of these will become a UBO range.
       */
      while (offsets != 0) {
         /* Find the first 1 in the offsets bitfield. This represents the
          * start of a range of interesting UBO data. Make it zero-indexed.
          */
         int first_bit = ffsll(offsets) - 1;

         /* Find the first 0 bit in offsets beyond first_bit. To find the
          * first zero bit, we find the first 1 bit in the complement. In
          * order to ignore bits before first_bit, we mask off those bits.
          */
         int first_hole = ffsll(~offsets & ~((1ull << first_bit) - 1)) - 1;

         if (first_hole == -1) {
            /* If we didn't find a hole, then set it to the end of the
             * bitfield. There are no more ranges to process.
             */
            first_hole = 64;
            offsets = 0;
         } else {
            /* We've processed all bits before first_hole. Mask them off. */
            offsets &= ~((1ull << first_hole) - 1);
         }

         struct push_range_entry *range_entry =
            util_dynarray_grow(&ranges, struct push_range_entry, 1);

         assert(info->key.index < push_map->block_count);
         const struct anv_pipeline_binding *binding =
            &push_map->block_to_descriptor[info->key.index];

         range_entry->range.set = binding->set;
         range_entry->range.index = binding->index;
         range_entry->range.dynamic_offset_index =
            binding->dynamic_offset_index;
         range_entry->range.start = first_bit;
         /* first_hole is one beyond the end, so we don't need to add 1 */
         range_entry->range.length = first_hole - first_bit;

         range_entry->benefit = 0;
         for (int i = 0; i < range_entry->range.length; i++)
            range_entry->benefit += info->uses[first_bit + i];

         if (false)
            print_push_entry(stderr, info, range_entry, &state);
      }
   }

   /* TODO: Consider combining ranges.
    *
    * We can only push 4 ranges via 3DSTATE_CONSTANT_XS. If there are
    * more ranges, and two are close by with only a small hole, it may be
    * worth combining them. The holes will waste register space, but the
    * benefit of removing pulls may outweigh that cost.
    */

   /* Sort the list so the most beneficial ranges are at the front. */
   int nr_entries = ranges.size / sizeof(struct push_range_entry);
   if (nr_entries > 0) {
      qsort(ranges.data, nr_entries, sizeof(struct push_range_entry),
            cmp_push_range_entry);
   }

   if (false) {
      util_dynarray_foreach(&ranges, struct push_range_entry, entry) {
         print_push_entry(stderr, NULL, entry, &state);
      }
   }

   struct push_range_entry *entries = ranges.data;

   /* Up to here start/length were counted in register-sized chunks. Rescale
    * them to 32-byte units.
    */
   for (int i = 0; i < nr_entries; i++) {
      entries[i].range.start *= devinfo->grf_size / 32;
      entries[i].range.length *= devinfo->grf_size / 32;
   }

   /* Return the top 4, limited to the maximum number of push registers.
    *
    * The Vulkan driver sets up additional non-UBO push constants, so it may
    * need to shrink these ranges further (see anv_nir_compute_push_layout.c).
    * The OpenGL driver treats legacy uniforms as a UBO, so this is enough.
    *
    * To limit further, simply drop the tail of the list, as that's the least
    * valuable portion.
    */
   const int max_ubos = 4;
   nr_entries = MIN2(nr_entries, max_ubos);

   const unsigned max_push = 64;
   unsigned total_push = 0;

   for (int i = 0; i < nr_entries; i++) {
      if (total_push + entries[i].range.length > max_push)
         entries[i].range.length = max_push - total_push;
      total_push += entries[i].range.length;
   }

   for (int i = 0; i < nr_entries; i++)
      out_ranges[i] = entries[i].range;
   for (int i = nr_entries; i < 4; i++)
      out_ranges[i] = (struct anv_push_range) {};

   /* Releases the hash table, its info records and the ranges array, all of
    * which were allocated out of mem_ctx.
    */
   ralloc_free(mem_ctx);
}