brw: serialize messages on Gfx12.x if required

The Intel EU fusion feature needs to be disabled on SEND messages
where the texture handle, sampler handle, or sampler header is not
identical across the fused threads.

This is the case in particular for accesses through non-uniform
texture/sampler handles, but could also occur with dynamic
programmable offsets (currently disabled).
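
As a rough illustration of the rule this change implements (a condensed
sketch, not verbatim code from the diff; the helper name is hypothetical),
the decision boils down to:

#include <stdbool.h>

/* Hypothetical condensation of the new rule: on Gfx12.x, a SEND whose
 * texture/sampler handle (or sampler header) may differ between the two
 * fused threads must execute serialized, because fusion would otherwise
 * run both threads with a single shared descriptor handle. */
static bool
send_needs_serialization(int gfx_ver,
                         bool texture_handle_divergent,
                         bool sampler_handle_divergent,
                         bool sampler_header_divergent)
{
   if (gfx_ver != 12) /* EU fusion is only a concern on Gfx12.x here */
      return false;

   return texture_handle_divergent ||
          sampler_handle_divergent ||
          sampler_header_divergent;
}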

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Alyssa Anne Rosenzweig <alyssa.rosenzweig@intel.com>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37394>
Lionel Landwerlin 2025-09-16 09:49:57 +03:00 committed by Marge Bot
parent 301b71a19f
commit 37a9c5411f
10 changed files with 175 additions and 9 deletions


@@ -206,6 +206,11 @@ static const char *const branch_ctrl[2] = {
    [1] = "BranchCtrl"
 };

+static const char *const fusion_ctrl[2] = {
+   [0] = "",
+   [1] = "FusionCtrl"
+};
+
 static const char *const wectrl[2] = {
    [0] = "",
    [1] = "WE_all"
@@ -2619,6 +2624,12 @@ brw_disassemble_inst(FILE *file, const struct brw_isa_info *isa,
       err |= control(file, "acc write control", accwr,
                      brw_eu_inst_acc_wr_control(devinfo, inst), &space);
    }
+
+   if (devinfo->ver == 12 && is_send(opcode)) {
+      err |= control(file, "fusion ctrl", fusion_ctrl,
+                     brw_eu_inst_fusion_ctrl(devinfo, inst), &space);
+   }
+
    if (is_send(opcode))
       err |= control(file, "end of thread", end_of_thread,
                      brw_eu_inst_eot(devinfo, inst), &space);


@@ -712,6 +712,10 @@ enum memory_flags {
    MEMORY_FLAG_VOLATILE_ACCESS = 1 << 2,
    /** Whether memory access is marked coherent by GLSL/SPIR-V. */
    MEMORY_FLAG_COHERENT_ACCESS = 1 << 3,
+   /** Whether this instruction should run serialized with regard to EU
+    * fusion (Gfx12.x only).
+    */
+   MEMORY_FLAG_FUSED_EU_DISABLE = 1 << 4,
 };

 enum rt_logical_srcs {


@@ -897,6 +897,7 @@ brw_eu_inst_sends_ex_desc(const struct intel_device_info *devinfo,
  * @{
  */
 F(eot,            /* 9+ */ 127, 127, /* 12+ */ 34, 34)
+F(fusion_ctrl,    /* 9+ */  -1,  -1, /* 12+ */ 33, 33)
 F(mlen,           /* 9+ */ 124, 121, /* 12+ */ MD12(28), MD12(25))
 F(rlen,           /* 9+ */ 120, 116, /* 12+ */ MD12(24), MD12(20))
 F(header_present, /* 9+ */ 115, 115, /* 12+ */ MD12(19), MD12(19))
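
The F() table above declares a getter/setter pair per instruction field,
with per-generation bit ranges; -1/-1 marks the field as absent before
Gfx12. As a hedged sketch only (assuming the usual brw_eu_inst_set_bits /
brw_eu_inst_bits helpers; the real macro also asserts on the absent case),
the new entry amounts to accessors for bit 33 of the SEND instruction:

/* Illustrative expansion, not the literal macro output. */
static inline void
brw_eu_inst_set_fusion_ctrl(const struct intel_device_info *devinfo,
                            brw_eu_inst *inst, uint64_t value)
{
   assert(devinfo->ver >= 12);
   brw_eu_inst_set_bits(inst, 33, 33, value);
}

static inline uint64_t
brw_eu_inst_fusion_ctrl(const struct intel_device_info *devinfo,
                        const brw_eu_inst *inst)
{
   assert(devinfo->ver >= 12);
   return brw_eu_inst_bits(inst, 33, 33);
}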


@@ -6471,6 +6471,8 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,
       brw_inst *inst = ubld.emit(SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload,
                                  srcs, GET_BUFFER_SIZE_SRCS);
       inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
+      inst->fused_eu_disable =
+         (nir_intrinsic_access(instr) & ACCESS_FUSED_EU_DISABLE_INTEL) != 0;

       /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
        *
@@ -7016,12 +7018,15 @@ brw_from_nir_emit_memory_access(nir_to_brw_state &ntb,
       (nir_intrinsic_access(instr) & ACCESS_VOLATILE);
    const bool coherent_access = nir_intrinsic_has_access(instr) &&
       (nir_intrinsic_access(instr) & ACCESS_COHERENT);
+   const bool fused_eu_disable = nir_intrinsic_has_access(instr) &&
+      (nir_intrinsic_access(instr) & ACCESS_FUSED_EU_DISABLE_INTEL);
    const unsigned align =
       nir_intrinsic_has_align(instr) ? nir_intrinsic_align(instr) : 0;

    uint8_t flags =
       (include_helpers ? MEMORY_FLAG_INCLUDE_HELPERS : 0) |
       (volatile_access ? MEMORY_FLAG_VOLATILE_ACCESS : 0) |
-      (coherent_access ? MEMORY_FLAG_COHERENT_ACCESS : 0);
+      (coherent_access ? MEMORY_FLAG_COHERENT_ACCESS : 0) |
+      (fused_eu_disable ? MEMORY_FLAG_FUSED_EU_DISABLE : 0);

    bool no_mask_handle = false;
    int data_src = -1;
@@ -7661,6 +7666,7 @@ brw_from_nir_emit_texture(nir_to_brw_state &ntb,
    tex->residency = instr->is_sparse;
    tex->coord_components = instr->coord_components;
    tex->grad_components = lod_components;
+   tex->fused_eu_disable = (instr->backend_flags & BRW_TEX_INSTR_FUSED_EU_DISABLE) != 0;

    /* Wa_14012688258:
     *


@@ -198,6 +198,10 @@ brw_generator::generate_send(brw_send_inst *inst,
       brw_eu_inst_set_opcode(p->isa, brw_last_inst,
                              devinfo->ver >= 12 ? BRW_OPCODE_SENDC : BRW_OPCODE_SENDSC);
    }
+
+   /* Serialize messages if needed */
+   if (devinfo->ver == 12 && inst->fused_eu_disable)
+      brw_eu_inst_set_fusion_ctrl(devinfo, brw_last_inst, true);
 }

 void


@@ -213,7 +213,12 @@ struct brw_inst : brw_exec_node {
        */
       bool has_no_mask_send_params:1;

-      uint8_t pad:5;
+      /**
+       * Serialize the message (Gfx12.x only)
+       */
+      bool fused_eu_disable:1;
+
+      uint8_t pad:4;
    };
    uint16_t bits;
 };
@@ -261,6 +266,11 @@ struct brw_send_inst : brw_inst {
        */
       bool ex_bso:1;

+      /**
+       * Serialize the message (Gfx12.x only)
+       */
+      bool fused_eu_disable:1;
+
       /**
        * Only for SHADER_OPCODE_SEND, @offset field contains an immediate
        * part of the extended descriptor that must be encoded in the
@@ -268,7 +278,7 @@ struct brw_send_inst : brw_inst {
        */
       bool ex_desc_imm:1;

-      uint8_t pad:3;
+      uint8_t pad:2;
    };
    uint8_t send_bits;
 };
@@ -279,10 +289,29 @@ struct brw_tex_inst : brw_inst {
    uint32_t offset;
    uint8_t coord_components;
    uint8_t grad_components;

+   union {
+      struct {
+         /**
+          * Whether the instruction requests the residency data (additional
+          * register written).
+          */
          bool residency:1;
+
+         /**
+          * Serialize the message (Gfx12.x only)
+          */
+         bool fused_eu_disable:1;
+
+         /**
+          * Whether the surface handle is bindless
+          */
          bool surface_bindless:1;
+
+         /**
+          * Whether the sampler handle is bindless
+          */
          bool sampler_bindless:1;
+      };
+      uint8_t bits;
+   };
 };

 struct brw_mem_inst : brw_inst {
    enum lsc_opcode lsc_op;


@@ -1217,9 +1217,12 @@ lower_sampler_logical_send(const brw_builder &bld, brw_tex_inst *tex)
       }
    }

+   const bool fused_eu_disable = tex->fused_eu_disable;
+
    brw_send_inst *send = brw_transform_inst_to_send(bld, tex);
    tex = NULL;

+   send->fused_eu_disable = fused_eu_disable;
    send->mlen = mlen;
    send->header_size = header_size;
    send->sfid = BRW_SFID_SAMPLER;
@@ -1481,6 +1484,7 @@ lower_lsc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem)
    const bool volatile_access = mem->flags & MEMORY_FLAG_VOLATILE_ACCESS;
    const bool coherent_access = mem->flags & MEMORY_FLAG_COHERENT_ACCESS;
    const bool has_side_effects = mem->has_side_effects();
+   const bool fused_eu_disable = mem->flags & MEMORY_FLAG_FUSED_EU_DISABLE;

    const uint32_t data_size_B = lsc_data_size_bytes(data_size);
    const enum brw_reg_type data_type =
@@ -1634,6 +1638,7 @@ lower_lsc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem)
    send->header_size = 0;
    send->has_side_effects = has_side_effects;
    send->is_volatile = !has_side_effects || volatile_access;
+   send->fused_eu_disable = fused_eu_disable;

    /* Finally, the payload */
    send->src[SEND_SRC_PAYLOAD1] = payload;
@@ -1692,6 +1697,7 @@ lower_hdc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem)
    const bool block = mem->flags & MEMORY_FLAG_TRANSPOSE;
    const bool include_helpers = mem->flags & MEMORY_FLAG_INCLUDE_HELPERS;
    const bool volatile_access = mem->flags & MEMORY_FLAG_VOLATILE_ACCESS;
+   const bool fused_eu_disable = mem->flags & MEMORY_FLAG_FUSED_EU_DISABLE;
    const bool has_side_effects = mem->has_side_effects();
    const bool has_dest = mem->dst.file != BAD_FILE && !mem->dst.is_null();

    assert(mem->address_offset == 0);
@@ -1903,6 +1909,7 @@ lower_hdc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem)
    send->header_size = header.file != BAD_FILE ? 1 : 0;
    send->has_side_effects = has_side_effects;
    send->is_volatile = !has_side_effects || volatile_access;
+   send->fused_eu_disable = fused_eu_disable;

    if (block) {
       assert(send->force_writemask_all);
@@ -2447,6 +2454,7 @@ lower_get_buffer_size(const brw_builder &bld, brw_inst *inst)
    brw_reg surface = inst->src[GET_BUFFER_SIZE_SRC_SURFACE];
    brw_reg surface_handle = inst->src[GET_BUFFER_SIZE_SRC_SURFACE_HANDLE];
    brw_reg lod = bld.move_to_vgrf(inst->src[GET_BUFFER_SIZE_SRC_LOD], 1);
+   const bool fused_eu_disable = inst->fused_eu_disable;

    brw_send_inst *send = brw_transform_inst_to_send(bld, inst);
    inst = NULL;
@@ -2468,6 +2476,7 @@ lower_get_buffer_size(const brw_builder &bld, brw_inst *inst)
    send->dst = retype(send->dst, BRW_TYPE_UW);
    send->sfid = BRW_SFID_SAMPLER;
+   send->fused_eu_disable = fused_eu_disable;

    setup_surface_descriptors(bld, send, desc, surface, surface_handle);
 }


@@ -2065,6 +2065,86 @@ lower_txd_cb(const nir_tex_instr *tex, const void *data)
    return false;
 }

+static bool
+flag_fused_eu_disable_instr(nir_builder *b, nir_instr *instr, void *data)
+{
+   switch (instr->type) {
+   case nir_instr_type_tex: {
+      nir_tex_instr *tex = nir_instr_as_tex(instr);
+
+      for (unsigned i = 0; i < tex->num_srcs; ++i) {
+         nir_tex_src_type src_type = tex->src[i].src_type;
+         if (src_type != nir_tex_src_texture_handle &&
+             src_type != nir_tex_src_sampler_handle &&
+             src_type != nir_tex_src_texture_offset &&
+             src_type != nir_tex_src_sampler_offset)
+            continue;
+
+         if (nir_src_is_divergent(&tex->src[i].src)) {
+            tex->backend_flags |= BRW_TEX_INSTR_FUSED_EU_DISABLE;
+            return true;
+         }
+      }
+      return false;
+   }
+
+   case nir_instr_type_intrinsic: {
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+      /* We only need to care about intrinsics that refer to a
+       * structure/descriptor outside the EU's registers, like
+       * RENDER_SURFACE_STATE/SAMPLER_STATE, because fusing will pick one
+       * thread's descriptor handle and use it for the 2 fused threads.
+       *
+       * Global pointers don't have that problem since all the access data
+       * is per lane in the payload of the SEND message (the 64-bit
+       * pointer).
+       *
+       * URB/shared-memory don't have that problem either because there is
+       * no descriptor information outside the EU; it's just a per-lane
+       * handle/offset.
+       */
+      switch (intrin->intrinsic) {
+      case nir_intrinsic_load_ssbo_uniform_block_intel:
+      case nir_intrinsic_load_ubo_uniform_block_intel:
+      case nir_intrinsic_load_ssbo_block_intel:
+      case nir_intrinsic_load_ssbo_intel:
+      case nir_intrinsic_store_ssbo_intel:
+      case nir_intrinsic_load_ssbo:
+      case nir_intrinsic_store_ssbo:
+      case nir_intrinsic_get_ssbo_size:
+      case nir_intrinsic_load_ubo:
+      case nir_intrinsic_image_load:
+      case nir_intrinsic_image_store:
+      case nir_intrinsic_image_size:
+      case nir_intrinsic_image_levels:
+      case nir_intrinsic_image_atomic:
+      case nir_intrinsic_image_atomic_swap:
+      case nir_intrinsic_bindless_image_load:
+      case nir_intrinsic_bindless_image_store:
+      case nir_intrinsic_bindless_image_size:
+      case nir_intrinsic_bindless_image_levels:
+      case nir_intrinsic_bindless_image_atomic:
+      case nir_intrinsic_bindless_image_atomic_swap: {
+         int src_idx = nir_get_io_index_src_number(intrin);
+         if (nir_src_is_divergent(&intrin->src[src_idx])) {
+            nir_intrinsic_set_access(intrin,
+                                     nir_intrinsic_access(intrin) |
+                                     ACCESS_FUSED_EU_DISABLE_INTEL);
+            return true;
+         }
+         return false;
+      }
+
+      default:
+         return false;
+      }
+   }
+
+   default:
+      return false;
+   }
+}
+
 /* Prepare the given shader for codegen
  *
  * This function is intended to be called right before going into the actual
@@ -2283,6 +2363,28 @@ brw_postprocess_nir_opts(nir_shader *nir, const struct brw_compiler *compiler,
       OPT(nir_lower_subgroups, &subgroups_options);
    }

+   /* Deal with EU fusion */
+   if (devinfo->ver == 12) {
+      nir_divergence_options options =
+         nir_divergence_across_subgroups |
+         nir_divergence_multiple_workgroup_per_compute_subgroup;
+
+      nir_foreach_function_impl(impl, nir) {
+         nir_divergence_analysis_impl(impl, options);
+      }
+
+      nir_shader_instructions_pass(nir,
+                                   flag_fused_eu_disable_instr,
+                                   nir_metadata_all, NULL);
+
+      /* We requested special divergence information above which is not
+       * needed afterwards.
+       */
+      nir_foreach_function_impl(impl, nir) {
+         nir_progress(true, impl, ~nir_metadata_divergence);
+      }
+   }
 }

 void
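
The comment in flag_fused_eu_disable_instr above draws the key distinction
between descriptor-based and pointer-based accesses. As a standalone
illustration (hypothetical enum and helper, not part of the change), the
pass's filter reduces to:

#include <stdbool.h>

/* Only descriptor-based accesses (binding-table or bindless surface and
 * sampler handles) can be mis-shared by EU fusion.  Global-pointer, URB
 * and shared-memory accesses carry their full address per lane in the
 * SEND payload, so a divergent address is harmless there. */
enum sketch_access_kind {
   SKETCH_ACCESS_DESCRIPTOR,     /* SSBO/UBO/image through a handle */
   SKETCH_ACCESS_GLOBAL_POINTER, /* 64-bit per-lane pointer */
   SKETCH_ACCESS_URB_OR_SHARED,  /* per-lane handle/offset only */
};

static bool
sketch_needs_fused_eu_disable(enum sketch_access_kind kind,
                              bool handle_divergent)
{
   return kind == SKETCH_ACCESS_DESCRIPTOR && handle_divergent;
}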


@@ -32,6 +32,8 @@
 extern "C" {
 #endif

+#define BRW_TEX_INSTR_FUSED_EU_DISABLE (1u << 30)
+
 extern const struct nir_shader_compiler_options brw_scalar_nir_options;

 int type_size_vec4(const struct glsl_type *type, bool bindless);


@@ -391,9 +391,7 @@ hash_inst(const void *v)
       const uint8_t tex_u8data[] = {
          tex->coord_components,
          tex->grad_components,
-         tex->residency,
-         tex->surface_bindless,
-         tex->sampler_bindless,
+         tex->bits,
       };
       const uint32_t tex_u32data[] = {
          tex->sampler_opcode,