intel/brw: Switch from LSC CMASK opcodes to regular LOAD/STORE

The LOAD/STORE opcodes take a vector size (number of components), while
the LOAD/STORE_CMASK opcodes take a channel mask.  For some reason, we
were passing a number of channels to lsc_msg_desc(), then using it to
construct a channel mask with all channels enabled, and always using the
CMASK message variants.

Considering we don't actually want to mask off any channels, we should
probably just use the regular LOAD/STORE opcodes, as they're more
flexible anyway.

One exception is that typed messages on Xe2 apparently only support
LOAD_CMASK/STORE_CMASK and not regular LOAD/STORE.  So we keep using
those there.  (Thanks to Sagar Ghuge for catching this!)

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30632>
This commit is contained in:
Kenneth Graunke 2024-08-05 20:54:55 -07:00 committed by Marge Bot
parent 7e52b67801
commit 55f193a105
2 changed files with 22 additions and 13 deletions

View file

@ -573,13 +573,6 @@ brw_mdc_cmask(unsigned num_channels)
return 0xf & (0xf << num_channels);
}
/**
 * Build the LSC channel mask used when every one of the first
 * \p num_channels channels should be enabled.
 *
 * \param num_channels  Number of channels to enable; must be between
 *                      1 and 4 inclusive (enforced by the assert below).
 * \return              The value of BITSET_MASK(num_channels).
 *                      NOTE(review): presumably the low num_channels bits
 *                      set -- confirm against the BITSET_MASK definition.
 */
static inline unsigned
lsc_cmask(unsigned num_channels)
{
   /* Callers must request at least one and at most four channels. */
   assert(num_channels > 0 && num_channels <= 4);

   const unsigned all_enabled = BITSET_MASK(num_channels);

   return all_enabled;
}
static inline uint32_t
brw_dp_untyped_surface_rw_desc(const struct intel_device_info *devinfo,
unsigned exec_size, /**< 0 for SIMD4x2 */
@ -1163,7 +1156,7 @@ lsc_msg_desc_wcmask(const struct intel_device_info *devinfo,
SET_BITS(addr_type, 30, 29);
if (lsc_opcode_has_cmask(opcode))
msg_desc |= SET_BITS(cmask ? cmask : lsc_cmask(num_channels), 15, 12);
msg_desc |= SET_BITS(cmask, 15, 12);
else
msg_desc |= SET_BITS(lsc_vect_size(num_channels), 14, 12);

View file

@ -1742,18 +1742,34 @@ lower_lsc_surface_logical_send(bblock_t *block, const fs_builder &bld,
switch (inst->opcode) {
case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
num_components = arg.ud;
inst->desc = lsc_msg_desc_wcmask(devinfo, LSC_OP_LOAD_CMASK,
surf_type, LSC_ADDR_SIZE_A32,
LSC_DATA_SIZE_D32, num_components,
false /* transpose */,
LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
BITSET_MASK(num_components));
break;
case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
num_components = arg.ud;
inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK,
inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
surf_type, LSC_ADDR_SIZE_A32,
LSC_DATA_SIZE_D32, num_components,
false /* transpose */,
LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
break;
case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
num_components = arg.ud;
inst->desc = lsc_msg_desc_wcmask(devinfo, LSC_OP_STORE_CMASK,
surf_type, LSC_ADDR_SIZE_A32,
LSC_DATA_SIZE_D32, num_components,
false /* transpose */,
LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS),
BITSET_MASK(num_components));
break;
case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
num_components = arg.ud;
inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE_CMASK,
inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE,
surf_type, LSC_ADDR_SIZE_A32,
LSC_DATA_SIZE_D32, num_components,
false /* transpose */,
@ -2070,7 +2086,7 @@ lower_lsc_a64_logical_send(bblock_t *block, const fs_builder &bld, fs_inst *inst
switch (inst->opcode) {
case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
num_components = arg;
inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK,
inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
LSC_DATA_SIZE_D32, num_components,
false /* transpose */,
@ -2078,7 +2094,7 @@ lower_lsc_a64_logical_send(bblock_t *block, const fs_builder &bld, fs_inst *inst
break;
case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
num_components = arg;
inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE_CMASK,
inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE,
LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
LSC_DATA_SIZE_D32, num_components,
false /* transpose */,
@ -2342,7 +2358,7 @@ lower_lsc_varying_pull_constant_logical_send(const fs_builder &bld,
if (alignment >= 4) {
inst->desc =
lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK,
lsc_msg_desc(devinfo, LSC_OP_LOAD,
surf_type, LSC_ADDR_SIZE_A32,
LSC_DATA_SIZE_D32,
4 /* num_channels */,