diff --git a/src/broadcom/compiler/meson.build b/src/broadcom/compiler/meson.build
index 2d68fb41138..95156140ad9 100644
--- a/src/broadcom/compiler/meson.build
+++ b/src/broadcom/compiler/meson.build
@@ -23,6 +23,7 @@ libbroadcom_compiler_files = files(
   'vir.c',
   'vir_dump.c',
   'vir_live_variables.c',
+  'vir_opt_constant_alu.c',
   'vir_opt_copy_propagate.c',
   'vir_opt_dead_code.c',
   'vir_opt_redundant_flags.c',
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index 4bec511346a..8e83a8ed90c 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -988,6 +988,7 @@ bool vir_opt_peephole_sf(struct v3d_compile *c);
 bool vir_opt_redundant_flags(struct v3d_compile *c);
 bool vir_opt_small_immediates(struct v3d_compile *c);
 bool vir_opt_vpm(struct v3d_compile *c);
+bool vir_opt_constant_alu(struct v3d_compile *c);
 void v3d_nir_lower_blend(nir_shader *s, struct v3d_compile *c);
 void v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c);
 void v3d_nir_lower_line_smooth(nir_shader *shader);
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
index 3d816cbb2b5..7f915b6b910 100644
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -1497,6 +1497,7 @@ vir_optimize(struct v3d_compile *c)
                 OPTPASS(vir_opt_redundant_flags);
                 OPTPASS(vir_opt_dead_code);
                 OPTPASS(vir_opt_small_immediates);
+                OPTPASS(vir_opt_constant_alu);
 
                 if (!progress)
                         break;
diff --git a/src/broadcom/compiler/vir_opt_constant_alu.c b/src/broadcom/compiler/vir_opt_constant_alu.c
new file mode 100644
index 00000000000..561b7af6b8a
--- /dev/null
+++ b/src/broadcom/compiler/vir_opt_constant_alu.c
@@ -0,0 +1,146 @@
+/*
+ * Copyright © 2021 Raspberry Pi
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * @file vir_opt_constant_alu.c
+ *
+ * Identifies sequences of ALU instructions that operate on constant operands
+ * and reduces them to a uniform load.
+ *
+ * Currently, this is useful to optimize the result of removing leading ldunifa
+ * instructions in the DCE pass, which can leave a series of constant additions
+ * that increment the unifa address by 4 for each leading ldunifa removed.
+ * It helps turn this:
+ *
+ * nop t1; ldunif (0x00000004 / 0.000000)
+ * nop t2; ldunif (0x00000004 / 0.000000)
+ * add t3, t1, t2
+ *
+ * into:
+ *
+ * nop t1; ldunif (0x00000004 / 0.000000)
+ * nop t2; ldunif (0x00000004 / 0.000000)
+ * nop t4; ldunif (0x00000008 / 0.000000)
+ * mov t3, t4
+ *
+ * For best results we want to run copy propagation between this pass and the
+ * combine constants pass: every time we convert an ALU instruction into a
+ * uniform load, we move the uniform into the original ALU destination. By
+ * running copy propagation immediately afterwards we can reuse that uniform
+ * as a source in follow-up ALU instructions, making them constant as well
+ * and allowing this pass to keep making progress. However, if the small
+ * immediates optimization runs before that, it can convert some of the MOVs
+ * to use small immediates instead of uniforms, preventing us from getting
+ * the most out of this pass, since small immediates are not copy propagated.
+ */
+
+#include "v3d_compiler.h"
+
+static bool
+opt_constant_add(struct v3d_compile *c, struct qinst *inst, uint32_t *values)
+{
+        /* FIXME: handle more add operations */
+        struct qreg unif = { };
+        switch (inst->qpu.alu.add.op) {
+        case V3D_QPU_A_ADD:
+                c->cursor = vir_after_inst(inst);
+                unif = vir_uniform_ui(c, values[0] + values[1]);
+                break;
+        default:
+                return false;
+        }
+
+        /* Remove the original ALU instruction and replace it with a uniform
+         * load.
+         */
+        struct qreg dst = inst->dst;
+        struct qinst *mov = vir_MOV_dest(c, dst, unif);
+        vir_remove_instruction(c, inst);
+        if (dst.file == QFILE_TEMP)
+                c->defs[dst.index] = mov;
+        return true;
+}
+
+static bool
+try_opt_constant_alu(struct v3d_compile *c, struct qinst *inst)
+{
+        if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
+                return false;
+
+        /* If the instruction does anything other than writing the result
+         * directly to the destination, skip it.
+         */
+        if (inst->qpu.alu.add.output_pack != V3D_QPU_PACK_NONE ||
+            inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE) {
+                return false;
+        }
+
+        if (inst->qpu.flags.ac != V3D_QPU_COND_NONE ||
+            inst->qpu.flags.mc != V3D_QPU_COND_NONE) {
+                return false;
+        }
+
+        assert(vir_get_nsrc(inst) <= 2);
+        uint32_t values[2];
+        for (int i = 0; i < vir_get_nsrc(inst); i++) {
+                if (inst->src[i].file == QFILE_SMALL_IMM &&
+                    v3d_qpu_small_imm_unpack(c->devinfo,
+                                             inst->qpu.raddr_b,
+                                             &values[i])) {
+                        continue;
+                }
+
+                if (inst->src[i].file == QFILE_TEMP) {
+                        struct qinst *def = c->defs[inst->src[i].index];
+                        if (!def)
+                                return false;
+
+                        if ((def->qpu.sig.ldunif || def->qpu.sig.ldunifrf) &&
+                            c->uniform_contents[def->uniform] == QUNIFORM_CONSTANT) {
+                                values[i] = c->uniform_data[def->uniform];
+                                continue;
+                        }
+                }
+
+                return false;
+        }
+
+        /* FIXME: handle mul operations */
+        if (vir_is_add(inst))
+                return opt_constant_add(c, inst, values);
+
+        return false;
+}
+
+bool
+vir_opt_constant_alu(struct v3d_compile *c)
+{
+        bool progress = false;
+        vir_for_each_block(block, c) {
+                vir_for_each_inst_safe(inst, block) {
+                        progress = try_opt_constant_alu(c, inst) || progress;
+                }
+        }
+
+        return progress;
+}
diff --git a/src/broadcom/compiler/vir_opt_dead_code.c b/src/broadcom/compiler/vir_opt_dead_code.c
index 59b458c6967..b40b25b20c0 100644
--- a/src/broadcom/compiler/vir_opt_dead_code.c
+++ b/src/broadcom/compiler/vir_opt_dead_code.c
@@ -160,10 +160,6 @@ increment_unifa_address(struct v3d_compile *c, struct qinst *unifa)
                 return true;
         }
 
-        /* FIXME: we can optimize this further by implementing a constant
-         * ALU pass in the backend, for the case where we are skipping
-         * multiple leading ldunifa.
-         */
         if (unifa->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
             unifa->qpu.alu.add.op == V3D_QPU_A_ADD) {
                 c->cursor = vir_after_inst(unifa);
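Note on the ordering comment in the new file: vir_opt_constant_alu() returns whether it
changed anything so that the OPTPASS hook added to vir_optimize() can keep iterating until
no pass makes further progress. The sketch below is only an illustration of that
progress-driven fixed-point pattern, using the pass declarations from v3d_compiler.h; the
function name run_backend_opt_loop is hypothetical, the pass ordering is only indicative,
and the real vir_optimize() uses an OPTPASS() macro that does more than shown here.

/* Illustrative sketch only (not Mesa code): a progress-driven optimization
 * loop that reruns the passes until none of them reports a change, which is
 * what makes the bool return value of vir_opt_constant_alu() meaningful.
 */
#include "v3d_compiler.h"

static void
run_backend_opt_loop(struct v3d_compile *c)
{
        bool progress;

        do {
                progress = false;

                /* Each pass returns true if it modified the program, forcing
                 * another round so later passes can clean up what earlier
                 * ones exposed (e.g. copy propagation feeding new constant
                 * operands into vir_opt_constant_alu()).
                 */
                progress |= vir_opt_copy_propagate(c);
                progress |= vir_opt_dead_code(c);
                progress |= vir_opt_small_immediates(c);
                progress |= vir_opt_constant_alu(c);
        } while (progress);
}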