diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index 6ea0e8b5679..116273dd07a 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -647,6 +647,12 @@ struct v3d_compile {
          */
         bool disable_tmu_pipelining;
 
+        /* Disable sorting of UBO loads with constant offset. This may
+         * increase the chances of being able to compile shaders with high
+         * register pressure.
+         */
+        bool disable_constant_ubo_load_sorting;
+
         /* Emits ldunif for each new uniform, even if the uniform was already
          * emitted in the same block. Useful to compile shaders with high
          * register pressure or to disable the optimization during uniform
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
index 941dc5b0fbc..5468364a6b0 100644
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -526,6 +526,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
                  void *debug_output_data,
                  int program_id, int variant_id,
                  uint32_t min_threads_for_reg_alloc,
+                 bool disable_constant_ubo_load_sorting,
                  bool disable_tmu_pipelining,
                  bool fallback_scheduler)
 {
@@ -543,6 +544,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
         c->min_threads_for_reg_alloc = min_threads_for_reg_alloc;
         c->fallback_scheduler = fallback_scheduler;
         c->disable_tmu_pipelining = disable_tmu_pipelining;
+        c->disable_constant_ubo_load_sorting = disable_constant_ubo_load_sorting;
 
         s = nir_shader_clone(c, s);
         c->s = s;
@@ -1101,6 +1103,248 @@ should_split_wrmask(const nir_instr *instr, const void *data)
         }
 }
 
+static nir_intrinsic_instr *
+nir_instr_as_constant_ubo_load(nir_instr *inst)
+{
+        if (inst->type != nir_instr_type_intrinsic)
+                return NULL;
+
+        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst);
+        if (intr->intrinsic != nir_intrinsic_load_ubo)
+                return NULL;
+
+        assert(nir_src_is_const(intr->src[0]));
+        if (!nir_src_is_const(intr->src[1]))
+                return NULL;
+
+        return intr;
+}
+
+static bool
+v3d_nir_sort_constant_ubo_load(nir_block *block, nir_intrinsic_instr *ref)
+{
+        bool progress = false;
+
+        nir_instr *ref_inst = &ref->instr;
+        uint32_t ref_offset = nir_src_as_uint(ref->src[1]);
+        uint32_t ref_index = nir_src_as_uint(ref->src[0]);
+
+        /* Go through all instructions after ref searching for constant UBO
+         * loads for the same UBO index.
+         */
+        bool seq_break = false;
+        nir_instr *inst = &ref->instr;
+        nir_instr *next_inst = NULL;
+        while (true) {
+                inst = next_inst ? next_inst : nir_instr_next(inst);
+                if (!inst)
+                        break;
+
+                next_inst = NULL;
+
+                if (inst->type != nir_instr_type_intrinsic)
+                        continue;
+
+                nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst);
+                if (intr->intrinsic != nir_intrinsic_load_ubo)
+                        continue;
+
+                /* We only produce unifa sequences for non-divergent loads */
+                if (nir_src_is_divergent(intr->src[1]))
+                        continue;
+
+                /* If there are any UBO loads that are not constant, or that
+                 * use a different UBO index, in between the reference load
+                 * and any other constant load for the same index, they would
+                 * break the unifa sequence. We flag that so we can then move
+                 * all constant UBO loads for the reference index before
+                 * these, and not just the ones that are not ordered, to
+                 * avoid breaking the sequence and reduce unifa writes.
+                 */
+                if (!nir_src_is_const(intr->src[1])) {
+                        seq_break = true;
+                        continue;
+                }
+                uint32_t offset = nir_src_as_uint(intr->src[1]);
+
+                assert(nir_src_is_const(intr->src[0]));
+                uint32_t index = nir_src_as_uint(intr->src[0]);
+                if (index != ref_index) {
+                        seq_break = true;
+                        continue;
+                }
+
+                /* Only move loads with an offset that is close enough to the
+                 * reference offset, since otherwise we would not be able to
+                 * skip the unifa write for them. See ntq_emit_load_ubo_unifa.
+                 */
+                if (abs(ref_offset - offset) > MAX_UNIFA_SKIP_DISTANCE)
+                        continue;
+
+                /* We will move this load if its offset is smaller than ref's
+                 * (in which case we will move it before ref) or if the offset
+                 * is larger than ref's but there are sequence breakers in
+                 * between (in which case we will move it after ref and before
+                 * the sequence breakers).
+                 */
+                if (!seq_break && offset >= ref_offset)
+                        continue;
+
+                /* Find where exactly we want to move this load:
+                 *
+                 * If we are moving it before ref, we want to check any other
+                 * UBO loads we placed before ref and make sure we insert this
+                 * one properly ordered with them. Likewise, if we are moving
+                 * it after ref.
+                 */
+                nir_instr *pos = ref_inst;
+                nir_instr *tmp = pos;
+                do {
+                        if (offset < ref_offset)
+                                tmp = nir_instr_prev(tmp);
+                        else
+                                tmp = nir_instr_next(tmp);
+
+                        if (!tmp || tmp == inst)
+                                break;
+
+                        /* Ignore non-unifa UBO loads */
+                        if (tmp->type != nir_instr_type_intrinsic)
+                                continue;
+
+                        nir_intrinsic_instr *tmp_intr =
+                                nir_instr_as_intrinsic(tmp);
+                        if (tmp_intr->intrinsic != nir_intrinsic_load_ubo)
+                                continue;
+
+                        if (nir_src_is_divergent(tmp_intr->src[1]))
+                                continue;
+
+                        /* Stop if we find a unifa UBO load that breaks the
+                         * sequence.
+                         */
+                        if (!nir_src_is_const(tmp_intr->src[1]))
+                                break;
+
+                        if (nir_src_as_uint(tmp_intr->src[0]) != index)
+                                break;
+
+                        uint32_t tmp_offset = nir_src_as_uint(tmp_intr->src[1]);
+                        if (offset < ref_offset) {
+                                if (tmp_offset < offset ||
+                                    tmp_offset >= ref_offset) {
+                                        break;
+                                } else {
+                                        pos = tmp;
+                                }
+                        } else {
+                                if (tmp_offset > offset ||
+                                    tmp_offset <= ref_offset) {
+                                        break;
+                                } else {
+                                        pos = tmp;
+                                }
+                        }
+                } while (true);
+
+                /* We can't move the UBO load before the instruction that
+                 * defines its constant offset. If that instruction is placed
+                 * in between the new location (pos) and the current location
+                 * of this load, we will have to move that instruction too.
+                 *
+                 * We don't care about the UBO index definition because that
+                 * is optimized to be reused by all UBO loads for the same
+                 * index and therefore is certain to be defined before the
+                 * first UBO load that uses it.
+                 */
+                nir_instr *offset_inst = NULL;
+                tmp = inst;
+                while ((tmp = nir_instr_prev(tmp)) != NULL) {
+                        if (pos == tmp) {
+                                /* We reached the target location without
+                                 * finding the instruction that defines the
+                                 * offset, so that instruction must be before
+                                 * the new position and we don't have to fix
+                                 * it.
+                                 */
+                                break;
+                        }
+                        if (intr->src[1].ssa->parent_instr == tmp) {
+                                offset_inst = tmp;
+                                break;
+                        }
+                }
+
+                if (offset_inst) {
+                        exec_node_remove(&offset_inst->node);
+                        exec_node_insert_node_before(&pos->node,
+                                                     &offset_inst->node);
+                }
+
+                /* Since we are moving the instruction before its current
+                 * location, grab its successor before the move so that
+                 * we can continue the next iteration of the main loop from
+                 * that instruction.
+                 */
+                next_inst = nir_instr_next(inst);
+
+                /* Move this load to the selected location */
+                exec_node_remove(&inst->node);
+                if (offset < ref_offset)
+                        exec_node_insert_node_before(&pos->node, &inst->node);
+                else
+                        exec_node_insert_after(&pos->node, &inst->node);
+
+                progress = true;
+        }
+
+        return progress;
+}
+
+static bool
+v3d_nir_sort_constant_ubo_loads_block(struct v3d_compile *c,
+                                      nir_block *block)
+{
+        bool progress = false;
+        bool local_progress;
+        do {
+                local_progress = false;
+                nir_foreach_instr_safe(inst, block) {
+                        nir_intrinsic_instr *intr =
+                                nir_instr_as_constant_ubo_load(inst);
+                        if (intr) {
+                                local_progress |=
+                                        v3d_nir_sort_constant_ubo_load(block, intr);
+                        }
+                }
+                progress |= local_progress;
+        } while (local_progress);
+
+        return progress;
+}
+
+/**
+ * Sorts constant UBO loads in each block by offset to maximize chances of
+ * skipping unifa writes when converting to VIR. This can increase register
+ * pressure.
+ */
+static bool
+v3d_nir_sort_constant_ubo_loads(nir_shader *s, struct v3d_compile *c)
+{
+        bool progress = false;
+        nir_foreach_function(function, s) {
+                if (function->impl) {
+                        nir_foreach_block(block, function->impl) {
+                                progress |=
+                                        v3d_nir_sort_constant_ubo_loads_block(c, block);
+                        }
+                        nir_metadata_preserve(function->impl,
+                                              nir_metadata_block_index |
+                                              nir_metadata_dominance);
+                }
+        }
+        return progress;
+}
+
 static void
 v3d_attempt_compile(struct v3d_compile *c)
 {
@@ -1211,6 +1455,9 @@ v3d_attempt_compile(struct v3d_compile *c)
         };
         NIR_PASS_V(c->s, nir_schedule, &schedule_options);
 
+        if (!c->disable_constant_ubo_load_sorting)
+                NIR_PASS_V(c->s, v3d_nir_sort_constant_ubo_loads, c);
+
         v3d_nir_to_vir(c);
 }
 
@@ -1284,7 +1531,8 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                 const char *name;
                 uint32_t min_threads_for_reg_alloc;
         } static const strategies[] = {
-                { "default", 1 },
+                { "default", 4 },
+                { "disable UBO load sorting", 1 },
                 { "disable TMU pipelining", 1 },
                 { "fallback scheduler", 1 }
         };
@@ -1294,8 +1542,9 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                                      debug_output, debug_output_data,
                                      program_id, variant_id,
                                      strategies[i].min_threads_for_reg_alloc,
-                                     i > 0, /* Disable TMU pipelining */
-                                     i > 1 /* Fallback_scheduler */);
+                                     i > 0, /* Disable UBO load sorting */
+                                     i > 1, /* Disable TMU pipelining */
+                                     i > 2 /* Fallback_scheduler */);
 
                 v3d_attempt_compile(c);
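---

Note on the heuristic this pass feeds (not part of the patch): on V3D, a constant
UBO load becomes a unifa (uniform address) write followed by ldunifa reads, and
each ldunifa advances the address by 4 bytes, so ntq_emit_load_ubo_unifa can skip
the unifa write when the next load hits the same UBO at an offset at or slightly
ahead of the current address. The standalone toy model below illustrates why
sorting by offset helps. It is a sketch only: the struct, the helper names, the
single-component loads, and the TOY_MAX_UNIFA_SKIP_DISTANCE value of 16 are
assumptions made for illustration, not code taken from the driver.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define TOY_MAX_UNIFA_SKIP_DISTANCE 16 /* assumed value for this sketch */

struct toy_load {
        uint32_t index;  /* UBO index */
        uint32_t offset; /* constant byte offset into the UBO */
};

/* Count unifa writes for a block of single-component loads: a write is
 * skipped when the next load targets the same UBO at an offset at or
 * slightly ahead of where unifa currently points (each ldunifa advances
 * the address by 4 bytes).
 */
static uint32_t
count_unifa_writes(const struct toy_load *loads, int n)
{
        uint32_t writes = 0;
        bool have_addr = false;
        uint32_t cur_index = 0, cur_offset = 0;

        for (int i = 0; i < n; i++) {
                bool skip = have_addr &&
                            loads[i].index == cur_index &&
                            loads[i].offset >= cur_offset &&
                            loads[i].offset - cur_offset <=
                                    TOY_MAX_UNIFA_SKIP_DISTANCE;
                if (!skip)
                        writes++;
                have_addr = true;
                cur_index = loads[i].index;
                cur_offset = loads[i].offset + 4; /* one ldunifa per load */
        }
        return writes;
}

/* Order loads by UBO index, then by ascending offset, mirroring the order
 * the NIR pass tries to establish within a block.
 */
static int
cmp_load(const void *va, const void *vb)
{
        const struct toy_load *a = va, *b = vb;
        if (a->index != b->index)
                return a->index < b->index ? -1 : 1;
        return a->offset < b->offset ? -1 : (a->offset > b->offset);
}

int
main(void)
{
        struct toy_load loads[] = {
                { 0, 16 }, { 0, 0 }, { 0, 4 }, { 0, 20 }, { 0, 8 },
        };
        const int n = sizeof(loads) / sizeof(loads[0]);

        printf("unsorted: %u unifa writes\n", count_unifa_writes(loads, n));
        qsort(loads, n, sizeof(loads[0]), cmp_load);
        printf("sorted:   %u unifa writes\n", count_unifa_writes(loads, n));
        return 0;
}

With these offsets the unsorted order needs 3 unifa writes while the sorted
order needs only 1; that saving is what the pass trades against the extra
register pressure its doc comment warns about, which is why the new
"disable UBO load sorting" fallback strategy exists.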