From 55f193a1059bb07ac265613ced48c7d462bfc43f Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Mon, 5 Aug 2024 20:54:55 -0700 Subject: [PATCH] intel/brw: Switch from LSC CMASK opcodes to regular LOAD/STORE The LOAD/STORE opcodes take a vector size (number of components), while the LOAD/STORE_CMASK opcodes take a channel mask. For some reason, we were passing a number of channels to lsc_msg_desc(), then using it to construct a channel mask with all channels enabled, and always using the CMASK message variants. Considering we don't actually want to mask off any channels, we should probably just use the regular LOAD/STORE opcodes, as they're more flexible anyway. One exception is that typed messages on Xe2 apparently only support LOAD_CMASK/STORE_CMASK and not regular LOAD/STORE. So we keep using those there. (Thanks to Sagar Ghuge for catching this!) Reviewed-by: Lionel Landwerlin Reviewed-by: Sagar Ghuge Part-of: --- src/intel/compiler/brw_eu.h | 9 +------ .../compiler/brw_lower_logical_sends.cpp | 26 +++++++++++++++---- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h index 99b54814a9a..baa4870b3d6 100644 --- a/src/intel/compiler/brw_eu.h +++ b/src/intel/compiler/brw_eu.h @@ -573,13 +573,6 @@ brw_mdc_cmask(unsigned num_channels) return 0xf & (0xf << num_channels); } -static inline unsigned -lsc_cmask(unsigned num_channels) -{ - assert(num_channels > 0 && num_channels <= 4); - return BITSET_MASK(num_channels); -} - static inline uint32_t brw_dp_untyped_surface_rw_desc(const struct intel_device_info *devinfo, unsigned exec_size, /**< 0 for SIMD4x2 */ @@ -1163,7 +1156,7 @@ lsc_msg_desc_wcmask(const struct intel_device_info *devinfo, SET_BITS(addr_type, 30, 29); if (lsc_opcode_has_cmask(opcode)) - msg_desc |= SET_BITS(cmask ? cmask : lsc_cmask(num_channels), 15, 12); + msg_desc |= SET_BITS(cmask, 15, 12); else msg_desc |= SET_BITS(lsc_vect_size(num_channels), 14, 12); diff --git a/src/intel/compiler/brw_lower_logical_sends.cpp b/src/intel/compiler/brw_lower_logical_sends.cpp index 380a6ab6130..301f408a442 100644 --- a/src/intel/compiler/brw_lower_logical_sends.cpp +++ b/src/intel/compiler/brw_lower_logical_sends.cpp @@ -1742,18 +1742,34 @@ lower_lsc_surface_logical_send(bblock_t *block, const fs_builder &bld, switch (inst->opcode) { case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: + num_components = arg.ud; + inst->desc = lsc_msg_desc_wcmask(devinfo, LSC_OP_LOAD_CMASK, + surf_type, LSC_ADDR_SIZE_A32, + LSC_DATA_SIZE_D32, num_components, + false /* transpose */, + LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS), + BITSET_MASK(num_components)); + break; case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: num_components = arg.ud; - inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, + inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, surf_type, LSC_ADDR_SIZE_A32, LSC_DATA_SIZE_D32, num_components, false /* transpose */, LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS)); break; case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: + num_components = arg.ud; + inst->desc = lsc_msg_desc_wcmask(devinfo, LSC_OP_STORE_CMASK, + surf_type, LSC_ADDR_SIZE_A32, + LSC_DATA_SIZE_D32, num_components, + false /* transpose */, + LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS), + BITSET_MASK(num_components)); + break; case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: num_components = arg.ud; - inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE_CMASK, + inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, surf_type, LSC_ADDR_SIZE_A32, LSC_DATA_SIZE_D32, num_components, false /* transpose */, @@ -2070,7 +2086,7 @@ lower_lsc_a64_logical_send(bblock_t *block, const fs_builder &bld, fs_inst *inst switch (inst->opcode) { case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL: num_components = arg; - inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, + inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64, LSC_DATA_SIZE_D32, num_components, false /* transpose */, @@ -2078,7 +2094,7 @@ lower_lsc_a64_logical_send(bblock_t *block, const fs_builder &bld, fs_inst *inst break; case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL: num_components = arg; - inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE_CMASK, + inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64, LSC_DATA_SIZE_D32, num_components, false /* transpose */, @@ -2342,7 +2358,7 @@ lower_lsc_varying_pull_constant_logical_send(const fs_builder &bld, if (alignment >= 4) { inst->desc = - lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, + lsc_msg_desc(devinfo, LSC_OP_LOAD, surf_type, LSC_ADDR_SIZE_A32, LSC_DATA_SIZE_D32, 4 /* num_channels */,