2016-01-02 03:21:28 -08:00
|
|
|
/*
|
|
|
|
|
* Copyright © 2015 Intel Corporation
|
|
|
|
|
*
|
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
|
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
|
|
|
* to deal in the Software without restriction, including without limitation
|
|
|
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
|
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
|
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
|
|
|
*
|
|
|
|
|
* The above copyright notice and this permission notice (including the next
|
|
|
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
|
|
|
* Software.
|
|
|
|
|
*
|
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
|
|
|
* IN THE SOFTWARE.
|
|
|
|
|
*/
|
|
|
|
|
|
2022-08-03 12:11:27 -07:00
|
|
|
#include "brw_eu.h"
|
2016-01-02 03:21:28 -08:00
|
|
|
#include "brw_nir.h"
|
|
|
|
|
#include "compiler/nir/nir.h"
|
|
|
|
|
#include "util/u_dynarray.h"
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* \file brw_nir_analyze_ubo_ranges.c
|
|
|
|
|
*
|
|
|
|
|
* This pass decides which portions of UBOs to upload as push constants,
|
|
|
|
|
* so shaders can access them as part of the thread payload, rather than
|
|
|
|
|
* having to issue expensive memory reads to pull the data.
|
|
|
|
|
*
|
|
|
|
|
* The 3DSTATE_CONSTANT_* mechanism can push data from up to 4 different
|
2022-08-03 12:11:27 -07:00
|
|
|
* buffers, in GRF sized units. This was always 256 bits (32 bytes).
|
|
|
|
|
* Starting with Xe2, it is 512 bits (64 bytes).
|
2016-01-02 03:21:28 -08:00
|
|
|
*
|
|
|
|
|
* To do this, we examine NIR load_ubo intrinsics, recording the number of
|
2022-08-03 12:11:27 -07:00
|
|
|
* loads at each offset. We track offsets at a sizeof(GRF) granularity, so even
|
2016-01-02 03:21:28 -08:00
|
|
|
* fields with a bit of padding between them tend to fall into contiguous
|
|
|
|
|
* ranges. We build a list of these ranges, tracking their "cost" (number
|
|
|
|
|
* of registers required) and "benefit" (number of pull loads eliminated
|
|
|
|
|
* by pushing the range). We then sort the list to obtain the four best
|
|
|
|
|
* ranges (most benefit for the least cost).
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
/* A candidate range of UBO data to push, together with the number of
 * pull loads that pushing it would eliminate.  Used to rank candidates
 * against each other (see score() and cmp_ubo_range_entry()).
 */
struct ubo_range_entry
{
   struct brw_ubo_range range;
   /* Number of recorded load_ubo uses that fall inside this range. */
   int benefit;
};
|
|
|
|
|
|
|
|
|
|
static int
|
|
|
|
|
score(const struct ubo_range_entry *entry)
|
|
|
|
|
{
|
|
|
|
|
return 2 * entry->benefit - entry->range.length;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Compares score for two UBO range entries.
|
|
|
|
|
*
|
|
|
|
|
* For a descending qsort().
|
|
|
|
|
*/
|
|
|
|
|
static int
|
|
|
|
|
cmp_ubo_range_entry(const void *va, const void *vb)
|
|
|
|
|
{
|
|
|
|
|
const struct ubo_range_entry *a = va;
|
|
|
|
|
const struct ubo_range_entry *b = vb;
|
|
|
|
|
|
2022-11-13 16:19:48 -08:00
|
|
|
/* Rank based on scores, descending order */
|
2016-01-02 03:21:28 -08:00
|
|
|
int delta = score(b) - score(a);
|
|
|
|
|
|
2022-11-13 16:19:48 -08:00
|
|
|
/* Then use the UBO block index as a tie-breaker, descending order */
|
2016-01-02 03:21:28 -08:00
|
|
|
if (delta == 0)
|
|
|
|
|
delta = b->range.block - a->range.block;
|
|
|
|
|
|
2022-11-13 16:19:48 -08:00
|
|
|
/* Finally use the start offset as a second tie-breaker, ascending order */
|
2016-01-02 03:21:28 -08:00
|
|
|
if (delta == 0)
|
2022-11-13 16:19:48 -08:00
|
|
|
delta = a->range.start - b->range.start;
|
2016-01-02 03:21:28 -08:00
|
|
|
|
|
|
|
|
return delta;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Per-UBO-block usage data gathered by analyze_ubos_block(). */
struct ubo_block_info
{
   /* Each bit in the offsets bitfield represents a 32-byte section of data.
    * If it's set to one, there is interesting UBO data at that offset.  If
    * not, there's a "hole" - padding between data - or just nothing at all.
    *
    * NOTE(review): analyze_ubos_block() actually tracks GRF-sized chunks
    * (REG_SIZE * reg_unit()), which is 64 bytes on Xe2+ — the "32-byte"
    * wording above reflects pre-Xe2 hardware only.
    */
   uint64_t offsets;

   /* Number of load_ubo uses whose starting offset falls in each of the
    * 64 chunks tracked by the offsets bitfield.
    */
   uint8_t uses[64];
};
|
|
|
|
|
|
|
|
|
|
/* Pass-wide analysis state. */
struct ubo_analysis_state
{
   /* Maps (UBO block index + 1) -> struct ubo_block_info; populated lazily
    * by get_block_info().
    */
   struct hash_table *blocks;

   /* Needed to compute the hardware register size (reg_unit()). */
   const struct intel_device_info *devinfo;
};
|
|
|
|
|
|
|
|
|
|
static struct ubo_block_info *
|
|
|
|
|
get_block_info(struct ubo_analysis_state *state, int block)
|
|
|
|
|
{
|
|
|
|
|
uint32_t hash = block + 1;
|
|
|
|
|
void *key = (void *) (uintptr_t) hash;
|
|
|
|
|
|
|
|
|
|
struct hash_entry *entry =
|
|
|
|
|
_mesa_hash_table_search_pre_hashed(state->blocks, hash, key);
|
|
|
|
|
|
|
|
|
|
if (entry)
|
|
|
|
|
return (struct ubo_block_info *) entry->data;
|
|
|
|
|
|
|
|
|
|
struct ubo_block_info *info =
|
|
|
|
|
rzalloc(state->blocks, struct ubo_block_info);
|
|
|
|
|
_mesa_hash_table_insert_pre_hashed(state->blocks, hash, key, info);
|
|
|
|
|
|
|
|
|
|
return info;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
analyze_ubos_block(struct ubo_analysis_state *state, nir_block *block)
|
|
|
|
|
{
|
|
|
|
|
nir_foreach_instr(instr, block) {
|
|
|
|
|
if (instr->type != nir_instr_type_intrinsic)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
2024-10-15 04:52:20 -07:00
|
|
|
if (intrin->intrinsic != nir_intrinsic_load_ubo)
|
2016-01-02 03:21:28 -08:00
|
|
|
continue;
|
|
|
|
|
|
2022-12-27 11:26:02 +02:00
|
|
|
if (brw_nir_ubo_surface_index_is_pushable(intrin->src[0]) &&
|
2018-10-20 12:21:46 -05:00
|
|
|
nir_src_is_const(intrin->src[1])) {
|
2022-12-27 11:26:02 +02:00
|
|
|
const int block = brw_nir_ubo_surface_index_get_push_block(intrin->src[0]);
|
2018-10-20 12:21:46 -05:00
|
|
|
const unsigned byte_offset = nir_src_as_uint(intrin->src[1]);
|
2022-08-03 12:11:27 -07:00
|
|
|
const unsigned sizeof_GRF = REG_SIZE * reg_unit(state->devinfo);
|
|
|
|
|
const int offset = byte_offset / sizeof_GRF;
|
2016-01-02 03:21:28 -08:00
|
|
|
|
intel/compiler: Properly consider UBO loads that cross 32B boundaries.
The UBO push analysis pass incorrectly assumed that all values would fit
within a 32B chunk, and only recorded a bit for the 32B chunk containing
the starting offset.
For example, if a UBO contained the following, tightly packed:
vec4 a; // [0, 16)
float b; // [16, 20)
vec4 c; // [20, 36)
then, c would start at offset 20 / 32 = 0 and end at 36 / 32 = 1,
which means that we ought to record two 32B chunks in the bitfield.
Similarly, dvec4s would suffer from the same problem.
v2: Rewrite the accounting, my calculations were wrong.
v3: Write a comment about partial values (requested by Jason).
Reviewed-by: Rafael Antognolli <rafael.antognolli@intel.com> [v1]
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net> [v3]
2018-06-08 14:24:16 -07:00
|
|
|
/* Avoid shifting by larger than the width of our bitfield, as this
|
|
|
|
|
* is undefined in C. Even if we require multiple bits to represent
|
|
|
|
|
* the entire value, it's OK to record a partial value - the backend
|
|
|
|
|
* is capable of falling back to pull loads for later components of
|
|
|
|
|
* vectors, as it has to shrink ranges for other reasons anyway.
|
|
|
|
|
*/
|
2016-01-02 03:21:28 -08:00
|
|
|
if (offset >= 64)
|
|
|
|
|
continue;
|
|
|
|
|
|
2022-08-03 12:11:27 -07:00
|
|
|
/* The value might span multiple sizeof(GRF) chunks. */
|
2024-10-15 05:06:19 -07:00
|
|
|
const unsigned num_components =
|
|
|
|
|
nir_def_last_component_read(&intrin->def) + 1;
|
|
|
|
|
const int bytes = num_components * (intrin->def.bit_size / 8);
|
2022-08-03 12:11:27 -07:00
|
|
|
const int start = ROUND_DOWN_TO(byte_offset, sizeof_GRF);
|
|
|
|
|
const int end = ALIGN(byte_offset + bytes, sizeof_GRF);
|
|
|
|
|
const int chunks = (end - start) / sizeof_GRF;
|
intel/compiler: Properly consider UBO loads that cross 32B boundaries.
The UBO push analysis pass incorrectly assumed that all values would fit
within a 32B chunk, and only recorded a bit for the 32B chunk containing
the starting offset.
For example, if a UBO contained the following, tightly packed:
vec4 a; // [0, 16)
float b; // [16, 20)
vec4 c; // [20, 36)
then, c would start at offset 20 / 32 = 0 and end at 36 / 32 = 1,
which means that we ought to record two 32B chunks in the bitfield.
Similarly, dvec4s would suffer from the same problem.
v2: Rewrite the accounting, my calculations were wrong.
v3: Write a comment about partial values (requested by Jason).
Reviewed-by: Rafael Antognolli <rafael.antognolli@intel.com> [v1]
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net> [v3]
2018-06-08 14:24:16 -07:00
|
|
|
|
2016-01-02 03:21:28 -08:00
|
|
|
/* TODO: should we count uses in loops as higher benefit? */
|
|
|
|
|
|
|
|
|
|
struct ubo_block_info *info = get_block_info(state, block);
|
intel/compiler: Properly consider UBO loads that cross 32B boundaries.
The UBO push analysis pass incorrectly assumed that all values would fit
within a 32B chunk, and only recorded a bit for the 32B chunk containing
the starting offset.
For example, if a UBO contained the following, tightly packed:
vec4 a; // [0, 16)
float b; // [16, 20)
vec4 c; // [20, 36)
then, c would start at offset 20 / 32 = 0 and end at 36 / 32 = 1,
which means that we ought to record two 32B chunks in the bitfield.
Similarly, dvec4s would suffer from the same problem.
v2: Rewrite the accounting, my calculations were wrong.
v3: Write a comment about partial values (requested by Jason).
Reviewed-by: Rafael Antognolli <rafael.antognolli@intel.com> [v1]
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net> [v3]
2018-06-08 14:24:16 -07:00
|
|
|
info->offsets |= ((1ull << chunks) - 1) << offset;
|
2016-01-02 03:21:28 -08:00
|
|
|
info->uses[offset]++;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
print_ubo_entry(FILE *file,
|
|
|
|
|
const struct ubo_range_entry *entry,
|
|
|
|
|
struct ubo_analysis_state *state)
|
|
|
|
|
{
|
|
|
|
|
struct ubo_block_info *info = get_block_info(state, entry->range.block);
|
|
|
|
|
|
|
|
|
|
fprintf(file,
|
2018-02-03 23:59:05 +02:00
|
|
|
"block %2d, start %2d, length %2d, bits = %"PRIx64", "
|
2016-01-02 03:21:28 -08:00
|
|
|
"benefit %2d, cost %2d, score = %2d\n",
|
|
|
|
|
entry->range.block, entry->range.start, entry->range.length,
|
|
|
|
|
info->offsets, entry->benefit, entry->range.length, score(entry));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler,
|
|
|
|
|
nir_shader *nir,
|
|
|
|
|
struct brw_ubo_range out_ranges[4])
|
|
|
|
|
{
|
|
|
|
|
void *mem_ctx = ralloc_context(NULL);
|
|
|
|
|
|
|
|
|
|
struct ubo_analysis_state state = {
|
|
|
|
|
.blocks =
|
|
|
|
|
_mesa_hash_table_create(mem_ctx, NULL, _mesa_key_pointer_equal),
|
2022-08-03 12:11:27 -07:00
|
|
|
.devinfo = compiler->devinfo,
|
2016-01-02 03:21:28 -08:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
/* Walk the IR, recording how many times each UBO block/offset is used. */
|
2023-06-28 19:40:56 +08:00
|
|
|
nir_foreach_function_impl(impl, nir) {
|
|
|
|
|
nir_foreach_block(block, impl) {
|
|
|
|
|
analyze_ubos_block(&state, block);
|
2016-01-02 03:21:28 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-08-03 12:11:27 -07:00
|
|
|
/* Find ranges: a block, starting register-size aligned byte offset, and
|
|
|
|
|
* length.
|
|
|
|
|
*/
|
2016-01-02 03:21:28 -08:00
|
|
|
struct util_dynarray ranges;
|
|
|
|
|
util_dynarray_init(&ranges, mem_ctx);
|
|
|
|
|
|
|
|
|
|
hash_table_foreach(state.blocks, entry) {
|
|
|
|
|
const int b = entry->hash - 1;
|
|
|
|
|
const struct ubo_block_info *info = entry->data;
|
|
|
|
|
uint64_t offsets = info->offsets;
|
|
|
|
|
|
|
|
|
|
/* Walk through the offsets bitfield, finding contiguous regions of
|
|
|
|
|
* set bits:
|
|
|
|
|
*
|
|
|
|
|
* 0000000001111111111111000000000000111111111111110000000011111100
|
|
|
|
|
* ^^^^^^^^^^^^^ ^^^^^^^^^^^^^^ ^^^^^^
|
|
|
|
|
*
|
|
|
|
|
* Each of these will become a UBO range.
|
|
|
|
|
*/
|
|
|
|
|
while (offsets != 0) {
|
|
|
|
|
/* Find the first 1 in the offsets bitfield. This represents the
|
|
|
|
|
* start of a range of interesting UBO data. Make it zero-indexed.
|
|
|
|
|
*/
|
|
|
|
|
int first_bit = ffsll(offsets) - 1;
|
|
|
|
|
|
|
|
|
|
/* Find the first 0 bit in offsets beyond first_bit. To find the
|
|
|
|
|
* first zero bit, we find the first 1 bit in the complement. In
|
|
|
|
|
* order to ignore bits before first_bit, we mask off those bits.
|
|
|
|
|
*/
|
|
|
|
|
int first_hole = ffsll(~offsets & ~((1ull << first_bit) - 1)) - 1;
|
|
|
|
|
|
|
|
|
|
if (first_hole == -1) {
|
|
|
|
|
/* If we didn't find a hole, then set it to the end of the
|
|
|
|
|
* bitfield. There are no more ranges to process.
|
|
|
|
|
*/
|
|
|
|
|
first_hole = 64;
|
|
|
|
|
offsets = 0;
|
|
|
|
|
} else {
|
|
|
|
|
/* We've processed all bits before first_hole. Mask them off. */
|
|
|
|
|
offsets &= ~((1ull << first_hole) - 1);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
struct ubo_range_entry *entry =
|
u_dynarray: turn util_dynarray_{grow, resize} into element-oriented macros
The main motivation for this change is API ergonomics: most operations
on dynarrays are really on elements, not on bytes, so it's weird to have
grow and resize as the odd operations out.
The secondary motivation is memory safety. Users of the old byte-oriented
functions would often multiply a number of elements with the element size,
which could overflow, and checking for overflow is tedious.
With this change, we only need to implement the overflow checks once.
The checks are cheap: since eltsize is a compile-time constant and the
functions should be inlined, they only add a single comparison and an
unlikely branch.
v2:
- ensure operations are no-op when allocation fails
- in util_dynarray_clone, call resize_bytes with a compile-time constant element size
v3:
- fix iris, lima, panfrost
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
2019-05-13 16:58:08 +02:00
|
|
|
util_dynarray_grow(&ranges, struct ubo_range_entry, 1);
|
2016-01-02 03:21:28 -08:00
|
|
|
|
|
|
|
|
entry->range.block = b;
|
|
|
|
|
entry->range.start = first_bit;
|
|
|
|
|
/* first_hole is one beyond the end, so we don't need to add 1 */
|
|
|
|
|
entry->range.length = first_hole - first_bit;
|
|
|
|
|
entry->benefit = 0;
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < entry->range.length; i++)
|
|
|
|
|
entry->benefit += info->uses[first_bit + i];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int nr_entries = ranges.size / sizeof(struct ubo_range_entry);
|
|
|
|
|
|
|
|
|
|
if (0) {
|
|
|
|
|
util_dynarray_foreach(&ranges, struct ubo_range_entry, entry) {
|
|
|
|
|
print_ubo_entry(stderr, entry, &state);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* TODO: Consider combining ranges.
|
|
|
|
|
*
|
2024-10-15 04:52:20 -07:00
|
|
|
* We can only push 4 ranges via 3DSTATE_CONSTANT_XS. If there are
|
2016-01-02 03:21:28 -08:00
|
|
|
* more ranges, and two are close by with only a small hole, it may be
|
|
|
|
|
* worth combining them. The holes will waste register space, but the
|
|
|
|
|
* benefit of removing pulls may outweigh that cost.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
/* Sort the list so the most beneficial ranges are at the front. */
|
2020-01-28 16:30:55 +02:00
|
|
|
if (nr_entries > 0) {
|
|
|
|
|
qsort(ranges.data, nr_entries, sizeof(struct ubo_range_entry),
|
|
|
|
|
cmp_ubo_range_entry);
|
|
|
|
|
}
|
2016-01-02 03:21:28 -08:00
|
|
|
|
|
|
|
|
struct ubo_range_entry *entries = ranges.data;
|
|
|
|
|
|
2025-01-01 23:38:14 -08:00
|
|
|
/* Return the top 4, limited to the maximum number of push registers.
|
2016-01-02 03:21:28 -08:00
|
|
|
*
|
2025-01-01 23:38:14 -08:00
|
|
|
* The Vulkan driver sets up additional non-UBO push constants, so it may
|
|
|
|
|
* need to shrink these ranges further (see anv_nir_compute_push_layout.c).
|
|
|
|
|
* The OpenGL driver treats legacy uniforms as a UBO, so this is enough.
|
|
|
|
|
*
|
|
|
|
|
* To limit further, simply drop the tail of the list, as that's the least
|
|
|
|
|
* valuable portion.
|
2016-01-02 03:21:28 -08:00
|
|
|
*/
|
2024-10-15 04:52:20 -07:00
|
|
|
const int max_ubos = 4;
|
2016-01-02 03:21:28 -08:00
|
|
|
nr_entries = MIN2(nr_entries, max_ubos);
|
|
|
|
|
|
2025-01-07 13:00:27 +02:00
|
|
|
const unsigned max_push_regs = 64 / reg_unit(compiler->devinfo);
|
2025-01-01 23:38:14 -08:00
|
|
|
unsigned total_push_regs = 0;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < nr_entries; i++) {
|
|
|
|
|
if (total_push_regs + entries[i].range.length > max_push_regs)
|
|
|
|
|
entries[i].range.length = max_push_regs - total_push_regs;
|
|
|
|
|
total_push_regs += entries[i].range.length;
|
|
|
|
|
}
|
|
|
|
|
|
2016-01-02 03:21:28 -08:00
|
|
|
for (int i = 0; i < nr_entries; i++) {
|
|
|
|
|
out_ranges[i] = entries[i].range;
|
2022-08-03 12:11:27 -07:00
|
|
|
|
|
|
|
|
/* To this point, various values have been tracked in terms of the real
|
|
|
|
|
* hardware register sizes. However, the rest of the compiler expects
|
|
|
|
|
* values in terms of pre-Xe2 256-bit registers. Scale start and length
|
|
|
|
|
* to account for this.
|
|
|
|
|
*/
|
|
|
|
|
out_ranges[i].start *= reg_unit(compiler->devinfo);
|
|
|
|
|
out_ranges[i].length *= reg_unit(compiler->devinfo);
|
2016-01-02 03:21:28 -08:00
|
|
|
}
|
|
|
|
|
for (int i = nr_entries; i < 4; i++) {
|
|
|
|
|
out_ranges[i].block = 0;
|
|
|
|
|
out_ranges[i].start = 0;
|
|
|
|
|
out_ranges[i].length = 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ralloc_free(ranges.mem_ctx);
|
|
|
|
|
}
|