mesa/src/intel/compiler/test_insert_load_reg.cpp
Ian Romanick 2d13acf9d9 brw: Add passes to generate and lower load_reg
v2: Add support for WE_all instructions... this already just worked, so
I only had to delete the check and the FINISHME comment.

v3: Use logic more like def_analysis::update_for_reads to determine when
to not insert LOAD_REG instructions. Based on a suggestion by Ken.

v4: Eliminate "store" from all the names since STORE_REG does not exist
anymore. Fold insert_load_reg into brw_insert_load_reg. Eliminate extra
call to s.def_analysis.require() after progress. Pull a loop-invariant
check out of the inst->sources loop. Drop call to
brw_opt_split_virtual_grfs after lowering load_reg. All suggested by
Caio.

v5: Assert that LOAD_REG doesn't already exist in
brw_insert_load_reg. Update comment before fully_defines. Both
suggested by Caio.

v6: Don't explicitly special-case SHADER_OPCODE_MEMORY_STORE_LOGICAL.
Move the inst->dst.file != VGRF check earlier to avoid the loop over
sources. Both suggested by Ken. Move the call to brw_insert_load_reg
a little bit later, and explain why it's at that location. Suggested
by Caio.

v7: Many changes to the for-each-source loop in brw_insert_load_reg.
Removes incorrect multiplication of s.alloc.sizes with reg_unit. Adds
checks for matching SIMD size and NoMask in the search for pre-existing
LOAD_REG of same value.

v8: Add some unit tests. Suggested by Caio.

shader-db:

Lunar Lake
total instructions in shared programs: 16923237 -> 16921895 (<.01%)
instructions in affected programs: 450565 -> 449223 (-0.30%)
helped: 251 / HURT: 377

total cycles in shared programs: 910428418 -> 889920590 (-2.25%)
cycles in affected programs: 719248184 -> 698740356 (-2.85%)
helped: 9076 / HURT: 9082

total fills in shared programs: 2242 -> 2218 (-1.07%)
fills in affected programs: 116 -> 92 (-20.69%)
helped: 2 / HURT: 0

total sends in shared programs: 848635 -> 848421 (-0.03%)
sends in affected programs: 810 -> 596 (-26.42%)
helped: 10 / HURT: 0

LOST:   82
GAINED: 78

Meteor Lake and DG2 had similar results. (Meteor Lake shown)
total instructions in shared programs: 19875784 -> 19871694 (-0.02%)
instructions in affected programs: 1050091 -> 1046001 (-0.39%)
helped: 251 / HURT: 2403

total cycles in shared programs: 905328238 -> 882446458 (-2.53%)
cycles in affected programs: 682736344 -> 659854564 (-3.35%)
helped: 7869 / HURT: 7911

total spills in shared programs: 5512 -> 5032 (-8.71%)
spills in affected programs: 1830 -> 1350 (-26.23%)
helped: 8 / HURT: 0

total fills in shared programs: 5648 -> 4782 (-15.33%)
fills in affected programs: 3312 -> 2446 (-26.15%)
helped: 8 / HURT: 0

total sends in shared programs: 1032942 -> 1032722 (-0.02%)
sends in affected programs: 572 -> 352 (-38.46%)
helped: 10 / HURT: 0

LOST:   138
GAINED: 53

Tiger Lake
total instructions in shared programs: 19711930 -> 19715591 (0.02%)
instructions in affected programs: 1040623 -> 1044284 (0.35%)
helped: 317 / HURT: 2474

total cycles in shared programs: 862988990 -> 860573870 (-0.28%)
cycles in affected programs: 612392461 -> 609977341 (-0.39%)
helped: 7447 / HURT: 7686

total sends in shared programs: 1034763 -> 1034555 (-0.02%)
sends in affected programs: 784 -> 576 (-26.53%)
helped: 8 / HURT: 0

LOST:   56
GAINED: 143

Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20545461 -> 20545220 (<.01%)
instructions in affected programs: 422405 -> 422164 (-0.06%)
helped: 180 / HURT: 459

total cycles in shared programs: 872697345 -> 866874523 (-0.67%)
cycles in affected programs: 573117917 -> 567295095 (-1.02%)
helped: 6783 / HURT: 6980

total spills in shared programs: 4335 -> 4336 (0.02%)
spills in affected programs: 90 -> 91 (1.11%)
helped: 1 / HURT: 2

total fills in shared programs: 4194 -> 4196 (0.05%)
fills in affected programs: 463 -> 465 (0.43%)
helped: 1 / HURT: 2

total sends in shared programs: 1079446 -> 1079238 (-0.02%)
sends in affected programs: 784 -> 576 (-26.53%)
helped: 8 / HURT: 0

LOST:   117
GAINED: 37

fossil-db:

All Intel platforms had similar results. (Lunar Lake shown)
Totals:
Instrs: 209708136 -> 209695617 (-0.01%); split: -0.02%, +0.01%
Send messages: 10927753 -> 10927640 (-0.00%)
Cycle count: 30540172048 -> 30427084732 (-0.37%); split: -0.99%, +0.62%
Spill count: 511621 -> 510932 (-0.13%); split: -0.22%, +0.08%
Fill count: 621166 -> 618440 (-0.44%); split: -0.56%, +0.12%
Scratch Memory Size: 35574784 -> 35648512 (+0.21%); split: -0.06%, +0.26%
Max live registers: 65453860 -> 65453140 (-0.00%); split: -0.00%, +0.00%
Non SSA regs after NIR: 75374990 -> 35195764 (-53.31%)

Totals from 503284 (71.25% of 706391) affected shaders:
Instrs: 180203778 -> 180191259 (-0.01%); split: -0.02%, +0.01%
Send messages: 9699732 -> 9699619 (-0.00%)
Cycle count: 30080349592 -> 29967262276 (-0.38%); split: -1.01%, +0.63%
Spill count: 511584 -> 510895 (-0.13%); split: -0.22%, +0.08%
Fill count: 621120 -> 618394 (-0.44%); split: -0.56%, +0.12%
Scratch Memory Size: 35443712 -> 35517440 (+0.21%); split: -0.06%, +0.27%
Max live registers: 52566092 -> 52565372 (-0.00%); split: -0.01%, +0.00%
Non SSA regs after NIR: 70110949 -> 29931723 (-57.31%)

Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31497>
2025-04-04 06:45:02 +00:00

151 lines
4.2 KiB
C++

/*
* Copyright © 2025 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "test_helpers.h"
#include "brw_builder.h"
/* Fixture shared by the brw_insert_load_reg tests below. It adds no members
 * of its own; everything it offers is inherited from brw_shader_pass_test.
 */
class insert_load_reg_test : public brw_shader_pass_test {
};
TEST_F(insert_load_reg_test, basic)
{
   brw_builder bld = make_shader(MESA_SHADER_FRAGMENT, 16);
   brw_builder exp = make_shader(MESA_SHADER_FRAGMENT, 16);

   brw_reg result = vgrf(bld, exp, BRW_TYPE_F);
   brw_reg input = vgrf(bld, exp, BRW_TYPE_F);

   /* Input shader: a single ADD reading a plain VGRF. */
   bld.ADD(result, input, brw_imm_f(1.0));

   EXPECT_PROGRESS(brw_insert_load_reg, bld);

   /* Expected shader: the same ADD, but reading the result of a LOAD_REG of
    * the original source.
    */
   brw_reg loaded = exp.LOAD_REG(input);
   exp.ADD(result, loaded, brw_imm_f(1.0));

   EXPECT_SHADERS_MATCH(bld, exp);
}
TEST_F(insert_load_reg_test, already_defs)
{
   brw_builder bld = make_shader(MESA_SHADER_FRAGMENT, 16);

   brw_reg first_sum = vgrf(bld, BRW_TYPE_F);
   brw_reg second_sum = vgrf(bld, BRW_TYPE_F);
   brw_reg fixed_src = retype(brw_vec16_reg(FIXED_GRF, 2, 0), BRW_TYPE_F);

   /* The first ADD produces a def because its sources are a FIXED_GRF and an
    * IMM. The second ADD also produces a def because its sources are that
    * def and an IMM. There is nothing for brw_insert_load_reg to do.
    */
   bld.ADD(first_sum, fixed_src, brw_imm_f(1.0));
   bld.ADD(second_sum, first_sum, brw_imm_f(1.0));

   EXPECT_NO_PROGRESS(brw_insert_load_reg, bld);
}
TEST_F(insert_load_reg_test, stride_0)
{
   brw_builder bld = make_shader(MESA_SHADER_FRAGMENT, 16);

   brw_reg sum = vgrf(bld, BRW_TYPE_F);

   /* Select component 0 of a fresh VGRF; the assertion below checks that
    * this really yields a stride-0 source before the pass is run.
    */
   brw_reg whole = vgrf(bld, BRW_TYPE_F);
   brw_reg src = component(whole, 0);
   ASSERT_EQ(src.stride, 0);

   bld.ADD(sum, src, brw_imm_f(1.0));

   /* A stride-0 source is expected to be left alone. */
   EXPECT_NO_PROGRESS(brw_insert_load_reg, bld);
}
TEST_F(insert_load_reg_test, stride_2)
{
   brw_builder bld = make_shader(MESA_SHADER_FRAGMENT, 16);

   brw_reg sum = vgrf(bld, BRW_TYPE_D);

   /* View a D-typed VGRF as W via subscript 0; the assertion below checks
    * that this really yields a stride-2 source before the pass is run.
    */
   brw_reg dwords = vgrf(bld, BRW_TYPE_D);
   brw_reg src = subscript(dwords, BRW_TYPE_W, 0);
   ASSERT_EQ(src.stride, 2);

   bld.ADD(sum, src, brw_imm_d(1));

   /* A stride-2 source is expected to be left alone. */
   EXPECT_NO_PROGRESS(brw_insert_load_reg, bld);
}
TEST_F(insert_load_reg_test, is_scalar)
{
   brw_builder bld = make_shader(MESA_SHADER_FRAGMENT, 16);
   brw_builder scalar_bld = bld.scalar_group();

   brw_reg sum = vgrf(bld, BRW_TYPE_F);
   brw_reg uniform_val = vgrf(scalar_bld, BRW_TYPE_F);

   /* Today is_scalar sources are handled exactly like any other stride=0
    * source. Nothing requires that, and it may (should!) change in the
    * future, which is why this case has its own test.
    */
   uniform_val.is_scalar = true;

   brw_reg first_component = component(uniform_val, 0);
   bld.ADD(sum, first_component, brw_imm_f(1.0));

   EXPECT_NO_PROGRESS(brw_insert_load_reg, bld);
}
TEST_F(insert_load_reg_test, emit_load_reg_once)
{
   brw_builder bld = make_shader(MESA_SHADER_FRAGMENT, 16);
   brw_builder exp = make_shader(MESA_SHADER_FRAGMENT, 16);

   brw_reg sum_a = vgrf(bld, exp, BRW_TYPE_F);
   brw_reg sum_b = vgrf(bld, exp, BRW_TYPE_F);
   brw_reg shared_src = vgrf(bld, exp, BRW_TYPE_F);

   /* Both ADDs read the same source, so the pass should generate a single
    * LOAD_REG and reuse it for both.
    */
   bld.ADD(sum_a, shared_src, brw_imm_f(1.0));
   bld.ADD(sum_b, shared_src, brw_imm_f(2.0));

   EXPECT_PROGRESS(brw_insert_load_reg, bld);

   /* Expected shader: one LOAD_REG feeding both ADDs. */
   brw_reg loaded = exp.LOAD_REG(shared_src);
   exp.ADD(sum_a, loaded, brw_imm_f(1.0));
   exp.ADD(sum_b, loaded, brw_imm_f(2.0));

   EXPECT_SHADERS_MATCH(bld, exp);
}
TEST_F(insert_load_reg_test, no_mask)
{
   brw_builder bld = make_shader(MESA_SHADER_FRAGMENT, 16);
   brw_builder exp = make_shader(MESA_SHADER_FRAGMENT, 16);

   brw_reg masked_sum = vgrf(bld, exp, BRW_TYPE_F);
   brw_reg nomask_sum = vgrf(bld, exp, BRW_TYPE_F);
   brw_reg shared_src = vgrf(bld, exp, BRW_TYPE_F);

   /* Two ADDs read the same source, but only the second is emitted through
    * an exec_all() builder.
    */
   bld.ADD(masked_sum, shared_src, brw_imm_f(1.0));
   bld.exec_all().ADD(nomask_sum, shared_src, brw_imm_f(2.0));

   EXPECT_PROGRESS(brw_insert_load_reg, bld);

   /* Expected shader: the two ADDs do not share a LOAD_REG. Each one gets
    * its own, emitted through the same kind of builder as the ADD it feeds.
    */
   brw_reg masked_load = exp.LOAD_REG(shared_src);
   exp.ADD(masked_sum, masked_load, brw_imm_f(1.0));

   brw_reg nomask_load = exp.exec_all().LOAD_REG(shared_src);
   exp.exec_all().ADD(nomask_sum, nomask_load, brw_imm_f(2.0));

   EXPECT_SHADERS_MATCH(bld, exp);
}
TEST_F(insert_load_reg_test, odd_size)
{
   brw_builder bld = make_shader(MESA_SHADER_FRAGMENT, 16);

   brw_reg sum = vgrf(bld, BRW_TYPE_D);
   brw_reg odd_src = vgrf(bld, BRW_TYPE_D, 3, 8);

   /* The source register is allocated as 3 SIMD8 units, which is not an even
    * multiple of the exec size. Generating a correct LOAD_REG for that would
    * be very difficult, so the pass should skip it.
    */
   bld.ADD(sum, odd_src, brw_imm_d(1));

   EXPECT_NO_PROGRESS(brw_insert_load_reg, bld);
}