diff --git a/src/intel/compiler/brw_load_reg.cpp b/src/intel/compiler/brw_load_reg.cpp new file mode 100644 index 00000000000..bd058968c23 --- /dev/null +++ b/src/intel/compiler/brw_load_reg.cpp @@ -0,0 +1,184 @@ +/* + * Copyright © 2024 Intel Corporation + * + * SPDX-License-Identifier: MIT + */ +#include "brw_shader.h" +#include "brw_cfg.h" +#include "brw_analysis.h" +#include "brw_builder.h" + +/* Duplicated from brw_def_analysis::fully_defines. */ +static bool +fully_defines(const brw_shader &s, brw_inst *inst) +{ + return s.alloc.sizes[inst->dst.nr] * REG_SIZE == inst->size_written && + !inst->is_partial_write(); +} + +bool +brw_insert_load_reg(brw_shader &s) +{ + bool progress = false; + + const brw_def_analysis &defs = s.def_analysis.require(); + + foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) { + /* These should not exist yet. */ + assert(inst->opcode != SHADER_OPCODE_LOAD_REG); + + /* These opcodes may have the right source and destination patterns to + * have their sources replaced by load_reg, but these instructions are + * special and / or wierd. They should not be modified. + */ + if (inst->opcode == SHADER_OPCODE_UNDEF || + inst->opcode == BRW_OPCODE_DPAS) { + continue; + } + + /* If the destination is non-VGRF adding load_reg instructions will not + * help. If the destination is already SSA, nothing needs to be done. + */ + if (inst->dst.file != VGRF || defs.get(inst->dst) != NULL) + continue; + + /* If there is a source that would cause def_analysis::update_for_reads + * to mark the def as invalid, adding load_reg for the sources will not + * help. + */ + if (inst->reads_accumulator_implicitly()) + continue; + + bool bad_source = false; + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file == ARF && + (inst->src[i].nr == BRW_ARF_ADDRESS || + inst->src[i].nr == BRW_ARF_ACCUMULATOR || + inst->src[i].nr == BRW_ARF_FLAG)) { + bad_source = true; + break; + } + } + + if (bad_source) + continue; + + /* If the instruction does not fully define the destination, adding + * load_reg instructions will not help. + */ + if (!fully_defines(s, inst)) + continue; + + if (inst->exec_size < 8) + continue; + + assert(inst->exec_size == 8 || inst->exec_size == 16 || + inst->exec_size == 32); + + const unsigned mask = (inst->exec_size / 8) - 1; + + /* Replace any non-SSA sources with load_reg of the source. */ + const brw_builder bld = brw_builder(inst); + for (int i = 0; i < inst->sources; i++) { + /* LOAD_REG only operates on VGRF sources. If the source is not VGRF, + * skip it. + */ + if (inst->src[i].file != VGRF) + continue; + + /* The source is already a def, so don't add a LOAD_REG. */ + if (defs.get(inst->src[i]) != NULL) + continue; + + /* Cases of stride != 1 are difficult to handle correctly. For + * example, when stride is 0, the source may have been written by + * NoMask instruction that cannot be seen from here. In this case, + * emitting a non-NoMask LOAD_REG may not actually copy the value + * that the instruction is trying to read. + * + * This means that is_scalar sources in larger exec sizes are not + * handled. Since enough information is available in the source, this + * could be added later. + */ + if (inst->src[i].stride != 1) + continue; + + /* If the size of the VGRF allocation is not an even multiple of + * the SIMD size, don't emit a load_reg. This can occur for sparse + * texture loads. These will have SIMD-size values for the texture + * data and a single SIMD1 register for the residency information. + */ + if ((s.alloc.sizes[inst->src[i].nr] & mask) != 0) + continue; + + brw_reg_type t = + brw_type_with_size(BRW_TYPE_UD, + brw_type_size_bits(inst->src[i].type)); + brw_reg old_src = brw_vgrf(inst->src[i].nr, t); + brw_reg new_src; + + /* Since the sources of a LOAD_REG will likely not be defs, + * none of the existing optimizations passes will eliminate + * redundant LOAD_REG instructions. Search back though this + * block to find a LOAD_REG of the same value to avoid emitting + * too many redundant instructions. + */ + foreach_inst_in_block_reverse_starting_from(brw_inst, scan_inst, inst) { + if (scan_inst->dst.file == old_src.file && + scan_inst->dst.nr == old_src.nr) { + break; + } + + if (scan_inst->opcode == SHADER_OPCODE_LOAD_REG && + scan_inst->exec_size == inst->exec_size && + scan_inst->force_writemask_all == inst->force_writemask_all && + old_src.equals(scan_inst->src[0])) { + new_src = scan_inst->dst; + break; + } + } + + if (new_src.file == BAD_FILE) + new_src = bld.LOAD_REG(old_src); + + inst->src[i].nr = new_src.nr; + progress = true; + } + } + + if (progress) + s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS | + BRW_DEPENDENCY_VARIABLES); + + return progress; +} + +bool +brw_lower_load_reg(brw_shader &s) +{ + bool progress = false; + + foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) { + if (inst->opcode == SHADER_OPCODE_LOAD_REG) { + const brw_builder ibld = brw_builder(inst); + + const unsigned bytes = inst->size_written; + const unsigned type_bytes = brw_type_size_bytes(inst->dst.type); + const unsigned bytes_per_mov = inst->exec_size * type_bytes; + + for (unsigned i = 0; i < bytes; i += bytes_per_mov) { + ibld.MOV(byte_offset(inst->dst, i), + byte_offset(inst->src[0], i)); + } + + inst->remove(); + progress = true; + } + } + + if (progress) + s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS | + BRW_DEPENDENCY_VARIABLES); + + return progress; +} diff --git a/src/intel/compiler/brw_opt.cpp b/src/intel/compiler/brw_opt.cpp index 5dd4d833cd2..ad35c4f7637 100644 --- a/src/intel/compiler/brw_opt.cpp +++ b/src/intel/compiler/brw_opt.cpp @@ -53,6 +53,14 @@ brw_optimize(brw_shader &s) OPT(brw_opt_eliminate_find_live_channel); + /* Add load_reg instructions before the main optimization loop to get more + * defs available in those passes. Do it after the preceeding few pre-loop + * passes so that it hopefully has less work to do. Having it here versus + * before the call to opt_dce made some difference, but it was mostly + * noise. + */ + OPT(brw_insert_load_reg); + /* Track how much non-SSA at this point. */ { const brw_def_analysis &defs = s.def_analysis.require(); @@ -85,6 +93,12 @@ brw_optimize(brw_shader &s) if (OPT(brw_opt_combine_convergent_txf)) OPT(brw_opt_copy_propagation_defs); + if (OPT(brw_lower_load_reg)) { + OPT(brw_opt_copy_propagation); + OPT(brw_opt_register_coalesce); + OPT(brw_opt_dead_code_eliminate); + } + if (OPT(brw_lower_pack)) { OPT(brw_opt_register_coalesce); OPT(brw_opt_dead_code_eliminate); diff --git a/src/intel/compiler/brw_shader.h b/src/intel/compiler/brw_shader.h index ebe55bd9dbf..d6f85638eaf 100644 --- a/src/intel/compiler/brw_shader.h +++ b/src/intel/compiler/brw_shader.h @@ -353,3 +353,6 @@ unsigned brw_get_lowered_simd_width(const brw_shader *shader, brw_reg brw_allocate_vgrf(brw_shader &s, brw_reg_type type, unsigned count); brw_reg brw_allocate_vgrf_units(brw_shader &s, unsigned units_of_REGSIZE); + +bool brw_insert_load_reg(brw_shader &s); +bool brw_lower_load_reg(brw_shader &s); diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build index d2d803a2232..5a7cd723ea3 100644 --- a/src/intel/compiler/meson.build +++ b/src/intel/compiler/meson.build @@ -56,6 +56,7 @@ libintel_compiler_brw_files = files( 'brw_inst.cpp', 'brw_inst.h', 'brw_isa_info.h', + 'brw_load_reg.cpp', 'brw_lower.cpp', 'brw_lower_dpas.cpp', 'brw_lower_integer_multiplication.cpp', @@ -212,6 +213,7 @@ if with_tests 'test_eu_validate.cpp', 'test_helpers.cpp', 'test_helpers.h', + 'test_insert_load_reg.cpp', 'test_lower_scoreboard.cpp', 'test_opt_cmod_propagation.cpp', 'test_opt_combine_constants.cpp', diff --git a/src/intel/compiler/test_insert_load_reg.cpp b/src/intel/compiler/test_insert_load_reg.cpp new file mode 100644 index 00000000000..6aaa91fc878 --- /dev/null +++ b/src/intel/compiler/test_insert_load_reg.cpp @@ -0,0 +1,151 @@ +/* + * Copyright © 2025 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "test_helpers.h" +#include "brw_builder.h" + +class insert_load_reg_test : public brw_shader_pass_test {}; + +TEST_F(insert_load_reg_test, basic) +{ + brw_builder bld = make_shader(MESA_SHADER_FRAGMENT, 16); + brw_builder exp = make_shader(MESA_SHADER_FRAGMENT, 16); + + brw_reg dst = vgrf(bld, exp, BRW_TYPE_F); + brw_reg src = vgrf(bld, exp, BRW_TYPE_F); + + bld.ADD(dst, src, brw_imm_f(1.0)); + + EXPECT_PROGRESS(brw_insert_load_reg, bld); + + exp.ADD(dst, exp.LOAD_REG(src), brw_imm_f(1.0)); + + EXPECT_SHADERS_MATCH(bld, exp); +} + +TEST_F(insert_load_reg_test, already_defs) +{ + brw_builder bld = make_shader(MESA_SHADER_FRAGMENT, 16); + + brw_reg dst0 = vgrf(bld, BRW_TYPE_F); + brw_reg dst1 = vgrf(bld, BRW_TYPE_F); + brw_reg src0 = retype(brw_vec16_reg(FIXED_GRF, 2, 0), BRW_TYPE_F); + + /* The first ADD will produce a def due its FIXED_GRF and IMM sources. The + * second ADD will also produces a def due to its def and IMM + * sources. brw_insert_load_reg shouldn't do anything. + */ + bld.ADD(dst0, src0, brw_imm_f(1.0)); + bld.ADD(dst1, dst0, brw_imm_f(1.0)); + + EXPECT_NO_PROGRESS(brw_insert_load_reg, bld); +} + +TEST_F(insert_load_reg_test, stride_0) +{ + brw_builder bld = make_shader(MESA_SHADER_FRAGMENT, 16); + + brw_reg dst = vgrf(bld, BRW_TYPE_F); + brw_reg src = component(vgrf(bld, BRW_TYPE_F), 0); + + ASSERT_EQ(src.stride, 0); + bld.ADD(dst, src, brw_imm_f(1.0)); + + EXPECT_NO_PROGRESS(brw_insert_load_reg, bld); +} + +TEST_F(insert_load_reg_test, stride_2) +{ + brw_builder bld = make_shader(MESA_SHADER_FRAGMENT, 16); + + brw_reg dst = vgrf(bld, BRW_TYPE_D); + brw_reg src = subscript(vgrf(bld, BRW_TYPE_D), BRW_TYPE_W, 0); + + ASSERT_EQ(src.stride, 2); + bld.ADD(dst, src, brw_imm_d(1)); + + EXPECT_NO_PROGRESS(brw_insert_load_reg, bld); +} + +TEST_F(insert_load_reg_test, is_scalar) +{ + brw_builder bld = make_shader(MESA_SHADER_FRAGMENT, 16); + brw_builder xbld = bld.scalar_group(); + + brw_reg dst = vgrf(bld, BRW_TYPE_F); + brw_reg src = vgrf(xbld, BRW_TYPE_F); + + /* Currently, is_scalar cases are treated the same as other stride=0 + * cases. This does not need to be the case, and it may (should!) be + * changed in the future. Split this out as a separate test. + */ + src.is_scalar = true; + + bld.ADD(dst, component(src, 0), brw_imm_f(1.0)); + + EXPECT_NO_PROGRESS(brw_insert_load_reg, bld); +} + +TEST_F(insert_load_reg_test, emit_load_reg_once) +{ + brw_builder bld = make_shader(MESA_SHADER_FRAGMENT, 16); + brw_builder exp = make_shader(MESA_SHADER_FRAGMENT, 16); + + brw_reg dst0 = vgrf(bld, exp, BRW_TYPE_F); + brw_reg dst1 = vgrf(bld, exp, BRW_TYPE_F); + brw_reg src = vgrf(bld, exp, BRW_TYPE_F); + + /* Since both instructions use the same source, only one LOAD_REG should be + * generated. + */ + bld.ADD(dst0, src, brw_imm_f(1.0)); + bld.ADD(dst1, src, brw_imm_f(2.0)); + + EXPECT_PROGRESS(brw_insert_load_reg, bld); + + brw_reg dst2 = exp.LOAD_REG(src); + exp.ADD(dst0, dst2, brw_imm_f(1.0)); + exp.ADD(dst1, dst2, brw_imm_f(2.0)); + + EXPECT_SHADERS_MATCH(bld, exp); +} + +TEST_F(insert_load_reg_test, no_mask) +{ + brw_builder bld = make_shader(MESA_SHADER_FRAGMENT, 16); + brw_builder exp = make_shader(MESA_SHADER_FRAGMENT, 16); + + brw_reg dst0 = vgrf(bld, exp, BRW_TYPE_F); + brw_reg dst1 = vgrf(bld, exp, BRW_TYPE_F); + brw_reg src0 = vgrf(bld, exp, BRW_TYPE_F); + + bld.ADD(dst0, src0, brw_imm_f(1.0)); + bld.exec_all().ADD(dst1, src0, brw_imm_f(2.0)); + + EXPECT_PROGRESS(brw_insert_load_reg, bld); + + brw_reg src1 = exp.LOAD_REG(src0); + exp.ADD(dst0, src1, brw_imm_f(1.0)); + brw_reg src2 = exp.exec_all().LOAD_REG(src0); + exp.exec_all().ADD(dst1, src2, brw_imm_f(2.0)); + + EXPECT_SHADERS_MATCH(bld, exp); +} + +TEST_F(insert_load_reg_test, odd_size) +{ + brw_builder bld = make_shader(MESA_SHADER_FRAGMENT, 16); + + brw_reg dst = vgrf(bld, BRW_TYPE_D); + brw_reg src = vgrf(bld, BRW_TYPE_D, 3, 8); + + /* The register allocation size is 3 SIMD8 units. Since that is not an even + * multiple of the exec size, it would be very difficult to generate a + * correct LOAD_REG. This should be skipped. + */ + bld.ADD(dst, src, brw_imm_d(1)); + + EXPECT_NO_PROGRESS(brw_insert_load_reg, bld); +}