From db25e87243ed60d4dc41df575d2229999ae3ba67 Mon Sep 17 00:00:00 2001 From: Caio Oliveira Date: Fri, 5 Jun 2026 09:09:02 -0700 Subject: [PATCH] intel: Change dpas_intel source order to follow DPAS - The general NIR intrinsic (`cmat_muladd`) is `Dst = A B C`, which follows the corresponding SPIR-V operation. - The DPAS hardware instruction is `Dst = C B A`. Right now the NIR intrinsic for `dpas_intel` has a third order, `Dst = C A B`. Change it to follow the DPAS hardware order. Reviewed-by: Ian Romanick Part-of: --- src/compiler/nir/nir_intrinsics.py | 17 +++++++---------- src/intel/compiler/brw/brw_from_nir.cpp | 3 +-- .../brw/brw_nir_lower_cooperative_matrix.c | 2 +- .../brw/brw_nir_opt_systolic_vectorize.c | 2 +- src/intel/compiler/jay/jay_from_nir.c | 3 +-- 5 files changed, 11 insertions(+), 16 deletions(-) diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 5e971183fa1..cd34212a3d3 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -2902,19 +2902,16 @@ system_value("btd_shader_type_intel", 1) # 64B, the pointer needs 256B aligned. system_value("ray_query_global_intel", 1, bit_sizes=[64]) -# Source 0: Accumulator matrix (type specified by DEST_TYPE) -# Source 1: A matrix (type specified by SRC_TYPE) -# Source 2: B matrix (type specified by SRC_TYPE) +# Source order same as DPAS instruction in the HW. +# +# Source 0: Accumulator matrix (type specified by DEST_BASE_TYPE) +# Source 1: B matrix (type specified by SRC_BASE_TYPE) +# Source 2: A matrix (type specified by SRC_BASE_TYPE) # # The matrix parameters are the slices owned by the invocation. # -# The accumulator is source 0 because that is the source the intrinsic -# infrastructure in NIR uses to determine the number of components in the -# result. -# -# The number of components for the second and third sources is -1 to avoid -# validation of its value. 
Some supported configurations will have the -# component count of that matrix different than the others. +# The number of components in the A/B sources may not match the +# destination due to different packing factors. intrinsic("dpas_intel", dest_comp=0, src_comp=[0, -1, -1], indices=[DEST_BASE_TYPE, SRC_BASE_TYPE, SATURATE, SYSTOLIC_DEPTH, REPEAT_COUNT], flags=[CAN_ELIMINATE]) diff --git a/src/intel/compiler/brw/brw_from_nir.cpp b/src/intel/compiler/brw/brw_from_nir.cpp index 6f328f1c325..7f2bff5816a 100644 --- a/src/intel/compiler/brw/brw_from_nir.cpp +++ b/src/intel/compiler/brw/brw_from_nir.cpp @@ -4255,11 +4255,10 @@ brw_from_nir_emit_cs_intrinsic(nir_to_brw_state &ntb, const unsigned dpas_exec_size = devinfo->ver >= 20 ? 16 : 8; brw_builder bldn = bld.exec_all().group(dpas_exec_size, 0); - /* DPAS uses a different source order: Accumulator, B, A. */ bldn.DPAS(retype(dest, dest_type), retype(src[0], dest_type), - retype(src[2], src_type), retype(src[1], src_type), + retype(src[2], src_type), sdepth, rcount) ->saturate = nir_intrinsic_saturate(instr); diff --git a/src/intel/compiler/brw/brw_nir_lower_cooperative_matrix.c b/src/intel/compiler/brw/brw_nir_lower_cooperative_matrix.c index 25997c1cc63..374d75d0fc3 100644 --- a/src/intel/compiler/brw/brw_nir_lower_cooperative_matrix.c +++ b/src/intel/compiler/brw/brw_nir_lower_cooperative_matrix.c @@ -861,8 +861,8 @@ lower_cmat_instr(nir_builder *b, nir_instr *instr, void *_state) nir_dpas_intel(b, dst_info->packing_factor * glsl_base_type_get_bit_size(dst_info->desc.element_type), nir_load_deref(b, accum_slice), - nir_load_deref(b, A_slice), nir_load_deref(b, B_slice), + nir_load_deref(b, A_slice), .dest_base_type = dst_type, .src_base_type = src_type, .saturate = nir_intrinsic_saturate(intrin), diff --git a/src/intel/compiler/brw/brw_nir_opt_systolic_vectorize.c b/src/intel/compiler/brw/brw_nir_opt_systolic_vectorize.c index ae446d4e311..07b0787ff1a 100644 --- 
a/src/intel/compiler/brw/brw_nir_opt_systolic_vectorize.c +++ b/src/intel/compiler/brw/brw_nir_opt_systolic_vectorize.c @@ -587,8 +587,8 @@ emit_dpas(const struct intel_device_info *devinfo, */ nir_def *result = nir_dpas_intel(&b, XEHP_SYSTOLIC_CHANNEL_BITS, nir_load_deref(&b, deref_d), - nir_load_deref(&b, deref_a), nir_load_deref(&b, deref_b), + nir_load_deref(&b, deref_a), .dest_base_type = GLSL_TYPE_INT, .src_base_type = GLSL_TYPE_INT8, .saturate = false, diff --git a/src/intel/compiler/jay/jay_from_nir.c b/src/intel/compiler/jay/jay_from_nir.c index 35315741918..b97651d90b2 100644 --- a/src/intel/compiler/jay/jay_from_nir.c +++ b/src/intel/compiler/jay/jay_from_nir.c @@ -1393,8 +1393,7 @@ jay_emit_dpas(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr) jay_as_gpr(b, nj_src(intr->src[2])), }; - /* Jay follows HW source order. */ - jay_DPAS(b, dst, src[0], src[2], src[1], nir_intrinsic_systolic_depth(intr), + jay_DPAS(b, dst, src[0], src[1], src[2], nir_intrinsic_systolic_depth(intr), nir_intrinsic_repeat_count(intr), jay_type_for_glsl_base_type(nir_intrinsic_dest_base_type(intr)), jay_type_for_glsl_base_type(nir_intrinsic_src_base_type(intr)),