broadcom/compiler: sort constant UBO loads by index and offset

This implements a NIR pass that groups together constant UBO loads
for the same UBO index in order of increasing offset, whenever the
distance between them is small enough to enable the "skip unifa write"
optimization.
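
As a rough sketch of the condition at play (illustrative only: the
authoritative check lives in ntq_emit_load_ubo_unifa, and the helper
name and max_skip parameter below are made up, with max_skip standing
in for MAX_UNIFA_SKIP_DISTANCE), the unifa write can only be skipped
when the next constant offset is not behind the address unifa already
points at and the gap is small enough to cover with a few dummy ldunifa
reads, which is why sorting by increasing offset helps:

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical helper, not the driver's code. */
static bool
can_skip_unifa_write(uint32_t unifa_offset, uint32_t next_offset,
                     uint32_t max_skip)
{
        /* A later load can reuse the current unifa address only if its
         * offset is not behind it and the gap can be covered by a few
         * dummy ldunifa reads (each read advances the address). */
        return next_offset >= unifa_offset &&
               next_offset - unifa_offset <= max_skip;
}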

This may increase register pressure, since it can move UBO loads
earlier, so we also add a compiler fallback strategy that disables the
sorting whenever we would otherwise have to drop the thread count to
compile the shader.
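
The shape of that fallback is visible in the strategies table further
down in the diff. As a self-contained sketch of the idea (the struct,
the toy try_compile and the flag layout below are illustrative, not the
driver's code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct strategy {
        const char *name;
        uint32_t min_threads_for_reg_alloc;
        bool disable_constant_ubo_load_sorting;
};

/* Most aggressive configuration first; each fallback trades some
 * optimization for a better chance of fitting the register budget. */
static const struct strategy strategies[] = {
        { "default",                  4, false },
        { "disable UBO load sorting", 1, true  },
};

/* Toy stand-in for a real compile attempt: pretend the shader only
 * fits in registers once UBO load sorting is disabled. */
static bool
try_compile(const struct strategy *s)
{
        return s->disable_constant_ubo_load_sorting;
}

int
main(void)
{
        for (unsigned i = 0; i < sizeof(strategies) / sizeof(strategies[0]); i++) {
                if (try_compile(&strategies[i])) {
                        printf("compiled with strategy: %s\n", strategies[i].name);
                        return 0;
                }
        }
        return 1;
}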

total instructions in shared programs: 13557555 -> 13550300 (-0.05%)
instructions in affected programs: 814684 -> 807429 (-0.89%)
helped: 4485
HURT: 2377
Instructions are helped.

total uniforms in shared programs: 3777243 -> 3760990 (-0.43%)
uniforms in affected programs: 112554 -> 96301 (-14.44%)
helped: 7226
HURT: 36
Uniforms are helped.

total max-temps in shared programs: 2318133 -> 2333761 (0.67%)
max-temps in affected programs: 63230 -> 78858 (24.72%)
helped: 23
HURT: 3044
Max-temps are HURT.

total sfu-stalls in shared programs: 32245 -> 32567 (1.00%)
sfu-stalls in affected programs: 389 -> 711 (82.78%)
helped: 139
HURT: 451
Inconclusive result.

total inst-and-stalls in shared programs: 13589800 -> 13582867 (-0.05%)
inst-and-stalls in affected programs: 817738 -> 810805 (-0.85%)
helped: 4478
HURT: 2395
Inst-and-stalls are helped.

total nops in shared programs: 354365 -> 342202 (-3.43%)
nops in affected programs: 31000 -> 18837 (-39.24%)
helped: 4405
HURT: 265
Nops are helped.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10100>
Author: Iago Toral Quiroga, 2021-04-06 13:53:36 +02:00, committed by Marge Bot
parent fb2214a441
commit 8998666de7
2 changed files with 258 additions and 3 deletions


@@ -647,6 +647,12 @@ struct v3d_compile {
         */
        bool disable_tmu_pipelining;

        /* Disable sorting of UBO loads with constant offset. This may
         * increase the chances of being able to compile shaders with high
         * register pressure.
         */
        bool disable_constant_ubo_load_sorting;

        /* Emits ldunif for each new uniform, even if the uniform was already
         * emitted in the same block. Useful to compile shaders with high
         * register pressure or to disable the optimization during uniform


@@ -526,6 +526,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
                 void *debug_output_data,
                 int program_id, int variant_id,
                 uint32_t min_threads_for_reg_alloc,
                 bool disable_constant_ubo_load_sorting,
                 bool disable_tmu_pipelining,
                 bool fallback_scheduler)
{

@@ -543,6 +544,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
        c->min_threads_for_reg_alloc = min_threads_for_reg_alloc;
        c->fallback_scheduler = fallback_scheduler;
        c->disable_tmu_pipelining = disable_tmu_pipelining;
        c->disable_constant_ubo_load_sorting = disable_constant_ubo_load_sorting;

        s = nir_shader_clone(c, s);
        c->s = s;
@@ -1101,6 +1103,248 @@ should_split_wrmask(const nir_instr *instr, const void *data)
        }
}

static nir_intrinsic_instr *
nir_instr_as_constant_ubo_load(nir_instr *inst)
{
        if (inst->type != nir_instr_type_intrinsic)
                return NULL;

        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst);
        if (intr->intrinsic != nir_intrinsic_load_ubo)
                return NULL;

        assert(nir_src_is_const(intr->src[0]));
        if (!nir_src_is_const(intr->src[1]))
                return NULL;

        return intr;
}

static bool
v3d_nir_sort_constant_ubo_load(nir_block *block, nir_intrinsic_instr *ref)
{
        bool progress = false;

        nir_instr *ref_inst = &ref->instr;
        uint32_t ref_offset = nir_src_as_uint(ref->src[1]);
        uint32_t ref_index = nir_src_as_uint(ref->src[0]);

        /* Go through all instructions after ref searching for constant UBO
         * loads for the same UBO index.
         */
        bool seq_break = false;
        nir_instr *inst = &ref->instr;
        nir_instr *next_inst = NULL;
        while (true) {
                inst = next_inst ? next_inst : nir_instr_next(inst);
                if (!inst)
                        break;

                next_inst = NULL;

                if (inst->type != nir_instr_type_intrinsic)
                        continue;

                nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst);
                if (intr->intrinsic != nir_intrinsic_load_ubo)
                        continue;

                /* We only produce unifa sequences for non-divergent loads */
                if (nir_src_is_divergent(intr->src[1]))
                        continue;

                /* If there are any UBO loads that are not constant or that
                 * use a different UBO index in between the reference load and
                 * any other constant load for the same index, they would break
                 * the unifa sequence. We will flag that so we can then move
                 * all constant UBO loads for the reference index before these
                 * and not just the ones that are not ordered to avoid breaking
                 * the sequence and reduce unifa writes.
                 */
                if (!nir_src_is_const(intr->src[1])) {
                        seq_break = true;
                        continue;
                }
                uint32_t offset = nir_src_as_uint(intr->src[1]);

                assert(nir_src_is_const(intr->src[0]));
                uint32_t index = nir_src_as_uint(intr->src[0]);
                if (index != ref_index) {
                        seq_break = true;
                        continue;
                }

                /* Only move loads with an offset that is close enough to the
                 * reference offset, since otherwise we would not be able to
                 * skip the unifa write for them. See ntq_emit_load_ubo_unifa.
                 */
                if (abs(ref_offset - offset) > MAX_UNIFA_SKIP_DISTANCE)
                        continue;
                /* We will move this load if its offset is smaller than ref's
                 * (in which case we will move it before ref) or if the offset
                 * is larger than ref's but there are sequence breakers in
                 * between (in which case we will move it after ref and
                 * before the sequence breakers).
                 */
                if (!seq_break && offset >= ref_offset)
                        continue;
                /* Find where exactly we want to move this load:
                 *
                 * If we are moving it before ref, we want to check any other
                 * UBO loads we placed before ref and make sure we insert this
                 * one properly ordered with them. Likewise, if we are moving
                 * it after ref.
                 */
                nir_instr *pos = ref_inst;
                nir_instr *tmp = pos;
                do {
                        if (offset < ref_offset)
                                tmp = nir_instr_prev(tmp);
                        else
                                tmp = nir_instr_next(tmp);

                        if (!tmp || tmp == inst)
                                break;

                        /* Ignore non-unifa UBO loads */
                        if (tmp->type != nir_instr_type_intrinsic)
                                continue;

                        nir_intrinsic_instr *tmp_intr =
                                nir_instr_as_intrinsic(tmp);
                        if (tmp_intr->intrinsic != nir_intrinsic_load_ubo)
                                continue;

                        if (nir_src_is_divergent(tmp_intr->src[1]))
                                continue;

                        /* Stop if we find a unifa UBO load that breaks the
                         * sequence.
                         */
                        if (!nir_src_is_const(tmp_intr->src[1]))
                                break;

                        if (nir_src_as_uint(tmp_intr->src[0]) != index)
                                break;

                        uint32_t tmp_offset = nir_src_as_uint(tmp_intr->src[1]);
                        if (offset < ref_offset) {
                                if (tmp_offset < offset ||
                                    tmp_offset >= ref_offset) {
                                        break;
                                } else {
                                        pos = tmp;
                                }
                        } else {
                                if (tmp_offset > offset ||
                                    tmp_offset <= ref_offset) {
                                        break;
                                } else {
                                        pos = tmp;
                                }
                        }
                } while (true);

                /* We can't move the UBO load before the instruction that
                 * defines its constant offset. If that instruction is placed
                 * in between the new location (pos) and the current location
                 * of this load, we will have to move that instruction too.
                 *
                 * We don't care about the UBO index definition because that
                 * is optimized to be reused by all UBO loads for the same
                 * index and therefore is certain to be defined before the
                 * first UBO load that uses it.
                 */
                nir_instr *offset_inst = NULL;
                tmp = inst;
                while ((tmp = nir_instr_prev(tmp)) != NULL) {
                        if (pos == tmp) {
                                /* We reached the target location without
                                 * finding the instruction that defines the
                                 * offset, so that instruction must be before
                                 * the new position and we don't have to fix it.
                                 */
                                break;
                        }

                        if (intr->src[1].ssa->parent_instr == tmp) {
                                offset_inst = tmp;
                                break;
                        }
                }

                if (offset_inst) {
                        exec_node_remove(&offset_inst->node);
                        exec_node_insert_node_before(&pos->node,
                                                     &offset_inst->node);
                }

                /* Since we are moving the instruction before its current
                 * location, grab its successor before the move so that
                 * we can continue the next iteration of the main loop from
                 * that instruction.
                 */
                next_inst = nir_instr_next(inst);

                /* Move this load to the selected location */
                exec_node_remove(&inst->node);
                if (offset < ref_offset)
                        exec_node_insert_node_before(&pos->node, &inst->node);
                else
                        exec_node_insert_after(&pos->node, &inst->node);

                progress = true;
        }

        return progress;
}

static bool
v3d_nir_sort_constant_ubo_loads_block(struct v3d_compile *c,
                                      nir_block *block)
{
        bool progress = false;
        bool local_progress;
        do {
                local_progress = false;
                nir_foreach_instr_safe(inst, block) {
                        nir_intrinsic_instr *intr =
                                nir_instr_as_constant_ubo_load(inst);
                        if (intr) {
                                local_progress |=
                                        v3d_nir_sort_constant_ubo_load(block, intr);
                        }
                }
                progress |= local_progress;
        } while (local_progress);

        return progress;
}

/**
 * Sorts constant UBO loads in each block by offset to maximize chances of
 * skipping unifa writes when converting to VIR. This can increase register
 * pressure.
 */
static bool
v3d_nir_sort_constant_ubo_loads(nir_shader *s, struct v3d_compile *c)
{
        bool progress = false;
        nir_foreach_function(function, s) {
                if (function->impl) {
                        nir_foreach_block(block, function->impl) {
                                progress |=
                                        v3d_nir_sort_constant_ubo_loads_block(c, block);
                        }

                        nir_metadata_preserve(function->impl,
                                              nir_metadata_block_index |
                                              nir_metadata_dominance);
                }
        }
        return progress;
}

static void
v3d_attempt_compile(struct v3d_compile *c)
{

@@ -1211,6 +1455,9 @@ v3d_attempt_compile(struct v3d_compile *c)
        };
        NIR_PASS_V(c->s, nir_schedule, &schedule_options);

        if (!c->disable_constant_ubo_load_sorting)
                NIR_PASS_V(c->s, v3d_nir_sort_constant_ubo_loads, c);

        v3d_nir_to_vir(c);
}
@@ -1284,7 +1531,8 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                const char *name;
                uint32_t min_threads_for_reg_alloc;
        } static const strategies[] = {
-               { "default", 1 },
+               { "default", 4 },
+               { "disable UBO load sorting", 1 },
                { "disable TMU pipelining", 1 },
                { "fallback scheduler", 1 }
        };
@@ -1294,8 +1542,9 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                                        debug_output, debug_output_data,
                                        program_id, variant_id,
                                        strategies[i].min_threads_for_reg_alloc,
-                                       i > 0, /* Disable TMU pipelining */
-                                       i > 1 /* Fallback_scheduler */);
+                                       i > 0, /* Disable UBO load sorting */
+                                       i > 1, /* Disable TMU pipelining */
+                                       i > 2 /* Fallback_scheduler */);

                v3d_attempt_compile(c);