diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c index ff1eabb2111..7d4153794f5 100644 --- a/src/freedreno/ir3/ir3_nir.c +++ b/src/freedreno/ir3/ir3_nir.c @@ -7,6 +7,7 @@ */ #include "util/u_debug.h" +#include "util/u_dynarray.h" #include "util/u_math.h" #include "ir3_compiler.h" @@ -277,31 +278,6 @@ ir3_lower_bit_size(const nir_instr *instr, UNUSED void *data) return 0; } -static void -ir3_get_variable_size_align_bytes(const glsl_type *type, unsigned *size, unsigned *align) -{ - switch (type->base_type) { - case GLSL_TYPE_ARRAY: - case GLSL_TYPE_INTERFACE: - case GLSL_TYPE_STRUCT: - glsl_size_align_handle_array_and_structs(type, ir3_get_variable_size_align_bytes, - size, align); - break; - case GLSL_TYPE_UINT8: - case GLSL_TYPE_INT8: - /* 8-bit values are handled through 16-bit half-registers, so the resulting size - * and alignment value has to be doubled to reflect the actual variable size - * requirement. - */ - *size = 2 * glsl_get_components(type); - *align = 2; - break; - default: - glsl_get_natural_size_align_bytes(type, size, align); - break; - } -} - #define OPT(nir, pass, ...) \ ({ \ bool this_progress = false; \ @@ -1114,6 +1090,174 @@ atomic_supported(const nir_instr * instr, const void * data) return nir_instr_as_intrinsic(instr)->def.bit_size != 64; } +/** + * Like glsl_get_natural_size_align_bytes, but for ir3 RA, where all <32-bit + * components are stored in half regs. + */ +static void +ir3_get_ra_size_align_bytes(const glsl_type *type, unsigned *size, unsigned *align) +{ + switch (type->base_type) { + case GLSL_TYPE_BOOL: + case GLSL_TYPE_UINT8: + case GLSL_TYPE_INT8: + case GLSL_TYPE_UINT16: + case GLSL_TYPE_INT16: + case GLSL_TYPE_FLOAT16: + case GLSL_TYPE_BFLOAT16: + *size = 2 * glsl_get_components(type); + *align = 2; + break; + + case GLSL_TYPE_FLOAT_E4M3FN: + case GLSL_TYPE_FLOAT_E5M2: + case GLSL_TYPE_UINT: + case GLSL_TYPE_INT: + case GLSL_TYPE_FLOAT: + case GLSL_TYPE_DOUBLE: + case GLSL_TYPE_UINT64: + case GLSL_TYPE_INT64: { + unsigned N = glsl_get_bit_size(type) / 8; + *size = N * glsl_get_components(type); + *align = N; + break; + } + + case GLSL_TYPE_ARRAY: + case GLSL_TYPE_INTERFACE: + case GLSL_TYPE_STRUCT: + glsl_size_align_handle_array_and_structs(type, + ir3_get_ra_size_align_bytes, + size, align); + break; + + case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_TEXTURE: + case GLSL_TYPE_IMAGE: + /* Bindless samplers and images. */ + *size = 8; + *align = 8; + break; + + case GLSL_TYPE_COOPERATIVE_MATRIX: + case GLSL_TYPE_ATOMIC_UINT: + case GLSL_TYPE_SUBROUTINE: + case GLSL_TYPE_VOID: + case GLSL_TYPE_ERROR: + UNREACHABLE("type does not have a natural size"); + } +} + +static int +variable_size_sort(const void *a, const void *b) +{ + const nir_variable *var_a = *(const nir_variable **)a; + const nir_variable *var_b = *(const nir_variable **)b; + + uint32_t size_a, align_a; + ir3_get_ra_size_align_bytes(var_a->type, &size_a, &align_a); + + uint32_t size_b, align_b; + ir3_get_ra_size_align_bytes(var_b->type, &size_b, &align_b); + + return size_a - size_b; +} + +/* Filters out variables from the set that might go to ir3 RA, in order to avoid + * exceeding the limit of register pressure in a single instruction. + * + * A single instruction could require up to 4 array vars to be fully loaded in + * GPR space: 1 destination and 3 src operands (since we reload full arrays when + * unspilling) + */ +static void +ir3_filter_vars_to_scratch_single_instr_limit(struct set *set, uint32_t limit, + bool limit_for_half) +{ + struct util_dynarray candidate_nonspilled; + util_dynarray_init(&candidate_nonspilled, NULL); + + /* Create an array of vars to potentially not spill sorted by increasing + * size. + */ + set_foreach(set, entry) { + const nir_variable *var = entry->key; + + /* If it's definitely a 32/64-bit array that will be stored in full regs, + * then don't consider it while we're limiting for half-reg accesses. This + * is conservative when we can't figure out the array type, but thanks to + * struct splitting we always successfully determine it on fossils db. + */ + if (glsl_type_is_array_or_matrix(var->type)) { + const struct glsl_type *elem_type = glsl_without_array_or_matrix(var->type); + if (limit_for_half && glsl_type_is_vector_or_scalar(elem_type) && + glsl_get_bit_size(elem_type) > 16) { + continue; + } + } + util_dynarray_append(&candidate_nonspilled, var); + } + + qsort( + util_dynarray_begin(&candidate_nonspilled), + util_dynarray_num_elements(&candidate_nonspilled, const nir_variable *), + sizeof(nir_variable *), variable_size_sort); + + /* Loop removing variables from the set of variables to not spill, until the + * worst case set of variables remaining fit under the limit. + */ + for (;;) { + int last = + util_dynarray_num_elements(&candidate_nonspilled, nir_variable *) - 1; + + uint32_t total_size = 0; + for (int i = last; i >= MAX2(last - 3, 0); i--) { + nir_variable *var = + *util_dynarray_element(&candidate_nonspilled, nir_variable *, i); + uint32_t size, align; + ir3_get_ra_size_align_bytes(var->type, &size, &align); + total_size += size; + } + + if (total_size <= limit) + break; + + nir_variable *var = + util_dynarray_pop(&candidate_nonspilled, nir_variable *); + _mesa_set_remove_key(set, var); + } + + util_dynarray_fini(&candidate_nonspilled); +} + +static void +ir3_vars_to_scratch_cb(struct set *set, void *data) +{ + struct ir3_pressure *limit_pressure = data; + + struct set *nonspilled = _mesa_pointer_set_create(NULL); + set_foreach(set, entry) { + _mesa_set_add(nonspilled, entry->key); + } + /* Filter for the half vars first, which may let the full limit (which + * considers all vars) succeed on vars it wouldn't otherwise. + * + * We decrement the limit for the array's sizes by a vec4's size, because an + * instruction will likely have non-array sources that also need to be + * present, so we can't have the whole register file taken up by an array. + */ + ir3_filter_vars_to_scratch_single_instr_limit( + nonspilled, (limit_pressure->half * 2) - 16, true); + ir3_filter_vars_to_scratch_single_instr_limit( + nonspilled, (limit_pressure->full * 2) - 16, false); + + set_foreach(set, entry) { + const nir_variable *var = entry->key; + if (_mesa_set_search(nonspilled, var)) + _mesa_set_remove_key(set, var); + } +} + /** * Filters the real_wavesize that was set based on API requirements, to an * appopriate value given hardware limits and the NIR shader we get. @@ -1249,9 +1393,10 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, * expensive. */ if (so->compiler->has_pvtmem) { - progress |= OPT(s, nir_lower_vars_to_scratch, - 16 * 16 /* bytes */, - ir3_get_variable_size_align_bytes, glsl_get_natural_size_align_bytes); + struct ir3_pressure limit_pressure = ir3_ra_get_reg_file_limits(so); + progress |= + OPT(s, nir_lower_vars_to_scratch_global, + glsl_get_natural_size_align_bytes, ir3_vars_to_scratch_cb, &limit_pressure); } /* Lower scratch writemasks */