From 3362b8dcb59d7679f44f9f53a865035c430bc41f Mon Sep 17 00:00:00 2001
From: Lionel Landwerlin
Date: Sat, 23 Aug 2025 15:48:06 +0300
Subject: [PATCH] brw: use a scalar builder for the load_payload on transpose loads

I noticed that SIMD32 shaders contain this kind of pattern:

   mov(32) g94<1>D 0D { align1 WE_all };
   send(1) g15UD g94UD nullUD 0x6210d500 0x02010000
           ugm MsgDesc: ( load, a32, d32, V16, transpose, L1STATE_L3MOCS dst_len = 1, src0_len = 1, src1_len = 0 bti ) BTI 2 base_offset 16 { align1 WE_all 1N I@5 $1 };

Why use a 32-wide register for a SEND that is only going to read the
first lane? We can stick to a single physical register and reduce
register pressure.

DG2 fossils-db results:

Totals:
Instrs: 157417515 -> 157417796 (+0.00%); split: -0.00%, +0.00%
Cycle count: 15362185116 -> 15363086774 (+0.01%); split: -0.05%, +0.05%
Max live registers: 29059141 -> 29051166 (-0.03%)
Max dispatch width: 5071256 -> 5075720 (+0.09%); split: +0.33%, -0.24%

Totals from 82132 (14.43% of 569221) affected shaders:
Instrs: 26564632 -> 26564913 (+0.00%); split: -0.00%, +0.00%
Cycle count: 4630907475 -> 4631809133 (+0.02%); split: -0.16%, +0.18%
Max live registers: 5425037 -> 5417062 (-0.15%)
Max dispatch width: 128384 -> 132848 (+3.48%); split: +12.92%, -9.45%

LNL fossils-db results:

Totals:
Instrs: 141870413 -> 141870745 (+0.00%); split: -0.00%, +0.00%
Cycle count: 20176018818 -> 20191262632 (+0.08%); split: -0.07%, +0.14%
Max live registers: 44858167 -> 44838370 (-0.04%)

Totals from 51859 (10.55% of 491590) affected shaders:
Instrs: 16834547 -> 16834879 (+0.00%); split: -0.00%, +0.00%
Cycle count: 5761980106 -> 5777223920 (+0.26%); split: -0.24%, +0.50%
Max live registers: 5893878 -> 5874081 (-0.34%)

Perf A/B testing only reported a 0.5% improvement on DG2 on one trace,
no changes on BMG.

Signed-off-by: Lionel Landwerlin Reviewed-by: Kenneth Graunke Part-of: --- src/intel/compiler/brw_lower_logical_sends.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/intel/compiler/brw_lower_logical_sends.cpp b/src/intel/compiler/brw_lower_logical_sends.cpp index 0d0bf7f9313..891a3d03fb5 100644 --- a/src/intel/compiler/brw_lower_logical_sends.cpp +++ b/src/intel/compiler/brw_lower_logical_sends.cpp @@ -1569,7 +1569,10 @@ lower_lsc_memory_logical_send(const brw_builder &bld, brw_inst *inst) if (addr.file != VGRF || !addr.is_contiguous()) { if (inst->force_writemask_all) { - const brw_builder dbld = bld.group(bld.shader->dispatch_width, 0); + const brw_builder dbld = + inst->exec_size == 1 ? + bld.scalar_group() : + bld.group(bld.shader->dispatch_width, 0); payload = dbld.move_to_vgrf(addr, coord_components); } else { payload = bld.move_to_vgrf(addr, coord_components);