From cb756ae8a24b72e83c9591370e34e03cba6cbd12 Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Wed, 18 Dec 2024 22:41:52 -0800 Subject: [PATCH] brw: Don't rely on SIMD splitting in opt_combine_convergent_txfs The SIMD splitting pass does not handle wide force_writemask_all instructions correctly at the moment. For example, a SIMD32 TXF on pre-Xe2 would get split to a pair of SIMD16. But it will set the groups to operate on channels 15:0 and 31:16. That's not what we want for a NoMask instruction - both should be 15:0, i.e. bld.group(inst->exec_size, 0). We could (and perhaps should) fix the SIMD splitting pass to handle this, but the pass already has subtle complexity in which builders are used. Or we could alter fs_builder::group(), but that has broader implications. As a stop-gap, just make opt_combine_convergent_txfs stop relying on SIMD splitting. It's trivial to do and fixes the issue without risking other breakage. Fixes: 6341b3cd87d9 ("brw: Combine convergent texture buffer fetches into fewer loads") Reviewed-by: Ian Romanick Part-of: --- src/intel/compiler/brw_opt_txf_combiner.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/intel/compiler/brw_opt_txf_combiner.cpp b/src/intel/compiler/brw_opt_txf_combiner.cpp index b4d912b0257..17f65fe89ef 100644 --- a/src/intel/compiler/brw_opt_txf_combiner.cpp +++ b/src/intel/compiler/brw_opt_txf_combiner.cpp @@ -85,6 +85,7 @@ brw_opt_combine_convergent_txf(fs_visitor &s) const def_analysis &defs = s.def_analysis.require(); const unsigned min_simd = 8 * reg_unit(s.devinfo); + const unsigned max_simd = 16 * reg_unit(s.devinfo); const unsigned grf_size = REG_SIZE * reg_unit(s.devinfo); bool progress = false; @@ -148,8 +149,8 @@ brw_opt_combine_convergent_txf(fs_visitor &s) continue; /* Emit divergent TXFs and replace the original ones with MOVs */ - for (unsigned curr = 0; curr < count; curr += 32) { - const unsigned lanes = CLAMP(count - curr, min_simd, 32); + for (unsigned curr = 0; curr < 
count; curr += max_simd) { + const unsigned lanes = CLAMP(count - curr, min_simd, max_simd); const unsigned width = util_next_power_of_two(lanes); const fs_builder ubld = fs_builder(&s).at(block, txfs[curr]).exec_all().group(width, 0);