diff --git a/src/intel/compiler/brw/brw_disasm.c b/src/intel/compiler/brw/brw_disasm.c
index 43636930706..e0709bec5c0 100644
--- a/src/intel/compiler/brw/brw_disasm.c
+++ b/src/intel/compiler/brw/brw_disasm.c
@@ -206,6 +206,11 @@ static const char *const branch_ctrl[2] = {
    [1] = "BranchCtrl"
 };
 
+static const char *const fusion_ctrl[2] = {
+   [0] = "",
+   [1] = "FusionCtrl"
+};
+
 static const char *const wectrl[2] = {
    [0] = "",
    [1] = "WE_all"
@@ -2619,6 +2624,12 @@ brw_disassemble_inst(FILE *file, const struct brw_isa_info *isa,
          err |= control(file, "acc write control", accwr,
                         brw_eu_inst_acc_wr_control(devinfo, inst), &space);
       }
+
+      if (devinfo->ver == 12 && is_send(opcode)) {
+         err |= control(file, "fusion ctrl", fusion_ctrl,
+                        brw_eu_inst_fusion_ctrl(devinfo, inst), &space);
+      }
+
       if (is_send(opcode))
          err |= control(file, "end of thread", end_of_thread,
                         brw_eu_inst_eot(devinfo, inst), &space);
diff --git a/src/intel/compiler/brw/brw_eu_defines.h b/src/intel/compiler/brw/brw_eu_defines.h
index 781239687a0..e40f2339e6b 100644
--- a/src/intel/compiler/brw/brw_eu_defines.h
+++ b/src/intel/compiler/brw/brw_eu_defines.h
@@ -712,6 +712,10 @@ enum memory_flags {
    MEMORY_FLAG_VOLATILE_ACCESS = 1 << 2,
    /** Whether memory access is marked coherent by GLSL/SPIR-V. */
    MEMORY_FLAG_COHERENT_ACCESS = 1 << 3,
+   /** Whether this instruction should run serialized with regard to EU
+    * fusion (Gfx12.x only).
+    */
+   MEMORY_FLAG_FUSED_EU_DISABLE = 1 << 4,
 };
 
 enum rt_logical_srcs {
diff --git a/src/intel/compiler/brw/brw_eu_inst.h b/src/intel/compiler/brw/brw_eu_inst.h
index 77c390b5ecd..e5a49b36460 100644
--- a/src/intel/compiler/brw/brw_eu_inst.h
+++ b/src/intel/compiler/brw/brw_eu_inst.h
@@ -897,6 +897,7 @@ brw_eu_inst_sends_ex_desc(const struct intel_device_info *devinfo,
  * @{
  */
 F(eot,             /* 9+ */ 127, 127,       /* 12+ */ 34, 34)
+F(fusion_ctrl,     /* 9+ */ -1, -1,         /* 12+ */ 33, 33)
 F(mlen,            /* 9+ */ 124, 121,       /* 12+ */ MD12(28), MD12(25))
 F(rlen,            /* 9+ */ 120, 116,       /* 12+ */ MD12(24), MD12(20))
 F(header_present,  /* 9+ */ 115, 115,      /* 12+ */ MD12(19), MD12(19))
diff --git a/src/intel/compiler/brw/brw_from_nir.cpp b/src/intel/compiler/brw/brw_from_nir.cpp
index 5804655c7f9..f96bbbbbc93 100644
--- a/src/intel/compiler/brw/brw_from_nir.cpp
+++ b/src/intel/compiler/brw/brw_from_nir.cpp
@@ -6471,6 +6471,8 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,
       brw_inst *inst = ubld.emit(SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload,
                                  srcs, GET_BUFFER_SIZE_SRCS);
       inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
+      inst->fused_eu_disable =
+         (nir_intrinsic_access(instr) & ACCESS_FUSED_EU_DISABLE_INTEL) != 0;
 
       /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
        *
@@ -7016,12 +7018,15 @@ brw_from_nir_emit_memory_access(nir_to_brw_state &ntb,
       (nir_intrinsic_access(instr) & ACCESS_VOLATILE);
    const bool coherent_access = nir_intrinsic_has_access(instr) &&
       (nir_intrinsic_access(instr) & ACCESS_COHERENT);
+   const bool fused_eu_disable = nir_intrinsic_has_access(instr) &&
+      (nir_intrinsic_access(instr) & ACCESS_FUSED_EU_DISABLE_INTEL);
    const unsigned align = nir_intrinsic_has_align(instr) ?
                           nir_intrinsic_align(instr) : 0;
 
    uint8_t flags = (include_helpers ? MEMORY_FLAG_INCLUDE_HELPERS : 0) |
                    (volatile_access ? MEMORY_FLAG_VOLATILE_ACCESS : 0) |
-                   (coherent_access ? MEMORY_FLAG_COHERENT_ACCESS : 0);
+                   (coherent_access ? MEMORY_FLAG_COHERENT_ACCESS : 0) |
+                   (fused_eu_disable ? MEMORY_FLAG_FUSED_EU_DISABLE : 0);
 
    bool no_mask_handle = false;
    int data_src = -1;
@@ -7661,6 +7666,7 @@ brw_from_nir_emit_texture(nir_to_brw_state &ntb,
    tex->residency = instr->is_sparse;
    tex->coord_components = instr->coord_components;
    tex->grad_components = lod_components;
+   tex->fused_eu_disable = (instr->backend_flags & BRW_TEX_INSTR_FUSED_EU_DISABLE) != 0;
 
    /* Wa_14012688258:
    *
diff --git a/src/intel/compiler/brw/brw_generator.cpp b/src/intel/compiler/brw/brw_generator.cpp
index d6252d3a56f..2a76d8bd821 100644
--- a/src/intel/compiler/brw/brw_generator.cpp
+++ b/src/intel/compiler/brw/brw_generator.cpp
@@ -198,6 +198,10 @@ brw_generator::generate_send(brw_send_inst *inst,
       brw_eu_inst_set_opcode(p->isa, brw_last_inst,
                              devinfo->ver >= 12 ? BRW_OPCODE_SENDC : BRW_OPCODE_SENDSC);
    }
+
+   /* Serialize messages if needed */
+   if (devinfo->ver == 12 && inst->fused_eu_disable)
+      brw_eu_inst_set_fusion_ctrl(devinfo, brw_last_inst, true);
 }
 
 void
diff --git a/src/intel/compiler/brw/brw_inst.h b/src/intel/compiler/brw/brw_inst.h
index 8474a6da3fb..6ad4d786ea8 100644
--- a/src/intel/compiler/brw/brw_inst.h
+++ b/src/intel/compiler/brw/brw_inst.h
@@ -213,7 +213,12 @@ struct brw_inst : brw_exec_node {
           */
          bool has_no_mask_send_params:1;
 
-         uint8_t pad:5;
+         /**
+          * Serialize the message (Gfx12.x only)
+          */
+         bool fused_eu_disable:1;
+
+         uint8_t pad:4;
       };
       uint16_t bits;
    };
@@ -261,6 +266,11 @@ struct brw_send_inst : brw_inst {
           */
         bool ex_bso:1;
 
+        /**
+         * Serialize the message (Gfx12.x only)
+         */
+        bool fused_eu_disable:1;
+
         /**
          * Only for SHADER_OPCODE_SEND, @offset field contains an immediate
          * part of the extended descriptor that must be encoded in the
@@ -268,7 +278,7 @@ struct brw_send_inst : brw_inst {
          */
         bool ex_desc_imm:1;
 
-        uint8_t pad:3;
+        uint8_t pad:2;
      };
      uint8_t send_bits;
   };
@@ -279,9 +289,28 @@ struct brw_tex_inst : brw_inst {
    uint32_t offset;
    uint8_t coord_components;
    uint8_t grad_components;
-   bool residency:1;
-   bool surface_bindless:1;
-   bool sampler_bindless:1;
+   union {
+      struct {
+         /**
+          * Whether the instruction requests the residency data (additional register
+          * written).
+          */
+         bool residency:1;
+         /**
+          * Serialize the message (Gfx12.x only)
+          */
+         bool fused_eu_disable:1;
+         /**
+          * Whether the surface handle is bindless
+          */
+         bool surface_bindless:1;
+         /**
+          * Whether the sampler handle is bindless
+          */
+         bool sampler_bindless:1;
+      };
+      uint8_t bits;
+   };
 };
 
 struct brw_mem_inst : brw_inst {
diff --git a/src/intel/compiler/brw/brw_lower_logical_sends.cpp b/src/intel/compiler/brw/brw_lower_logical_sends.cpp
index 17760cdc838..10ab57705a0 100644
--- a/src/intel/compiler/brw/brw_lower_logical_sends.cpp
+++ b/src/intel/compiler/brw/brw_lower_logical_sends.cpp
@@ -1217,9 +1217,12 @@ lower_sampler_logical_send(const brw_builder &bld, brw_tex_inst *tex)
       }
    }
 
+   const bool fused_eu_disable = tex->fused_eu_disable;
+
    brw_send_inst *send = brw_transform_inst_to_send(bld, tex);
    tex = NULL;
 
+   send->fused_eu_disable = fused_eu_disable;
    send->mlen = mlen;
    send->header_size = header_size;
    send->sfid = BRW_SFID_SAMPLER;
@@ -1481,6 +1484,7 @@ lower_lsc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem)
    const bool volatile_access = mem->flags & MEMORY_FLAG_VOLATILE_ACCESS;
    const bool coherent_access = mem->flags & MEMORY_FLAG_COHERENT_ACCESS;
    const bool has_side_effects = mem->has_side_effects();
+   const bool fused_eu_disable = mem->flags & MEMORY_FLAG_FUSED_EU_DISABLE;
 
    const uint32_t data_size_B = lsc_data_size_bytes(data_size);
    const enum brw_reg_type data_type =
@@ -1634,6 +1638,7 @@ lower_lsc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem)
    send->header_size = 0;
    send->has_side_effects = has_side_effects;
    send->is_volatile = !has_side_effects || volatile_access;
+   send->fused_eu_disable = fused_eu_disable;
 
    /* Finally, the payload */
    send->src[SEND_SRC_PAYLOAD1] = payload;
@@ -1692,6 +1697,7 @@ lower_hdc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem)
    const bool block = mem->flags & MEMORY_FLAG_TRANSPOSE;
    const bool include_helpers = mem->flags & MEMORY_FLAG_INCLUDE_HELPERS;
    const bool volatile_access = mem->flags & MEMORY_FLAG_VOLATILE_ACCESS;
+   const bool fused_eu_disable = mem->flags & MEMORY_FLAG_FUSED_EU_DISABLE;
    const bool has_side_effects = mem->has_side_effects();
    const bool has_dest = mem->dst.file != BAD_FILE && !mem->dst.is_null();
    assert(mem->address_offset == 0);
@@ -1903,6 +1909,7 @@ lower_hdc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem)
    send->header_size = header.file != BAD_FILE ? 1 : 0;
    send->has_side_effects = has_side_effects;
    send->is_volatile = !has_side_effects || volatile_access;
+   send->fused_eu_disable = fused_eu_disable;
 
    if (block) {
       assert(send->force_writemask_all);
@@ -2447,6 +2454,7 @@ lower_get_buffer_size(const brw_builder &bld, brw_inst *inst)
    brw_reg surface = inst->src[GET_BUFFER_SIZE_SRC_SURFACE];
    brw_reg surface_handle = inst->src[GET_BUFFER_SIZE_SRC_SURFACE_HANDLE];
    brw_reg lod = bld.move_to_vgrf(inst->src[GET_BUFFER_SIZE_SRC_LOD], 1);
+   const bool fused_eu_disable = inst->fused_eu_disable;
 
    brw_send_inst *send = brw_transform_inst_to_send(bld, inst);
    inst = NULL;
@@ -2468,6 +2476,7 @@ lower_get_buffer_size(const brw_builder &bld, brw_inst *inst)
 
    send->dst = retype(send->dst, BRW_TYPE_UW);
    send->sfid = BRW_SFID_SAMPLER;
+   send->fused_eu_disable = fused_eu_disable;
    setup_surface_descriptors(bld, send, desc, surface, surface_handle);
 }
 
diff --git a/src/intel/compiler/brw/brw_nir.c b/src/intel/compiler/brw/brw_nir.c
index 6159b47b245..78c1ccfed56 100644
--- a/src/intel/compiler/brw/brw_nir.c
+++ b/src/intel/compiler/brw/brw_nir.c
@@ -2065,6 +2065,86 @@ lower_txd_cb(const nir_tex_instr *tex, const void *data)
    return false;
 }
 
+static bool
+flag_fused_eu_disable_instr(nir_builder *b, nir_instr *instr, void *data)
+{
+   switch (instr->type) {
+   case nir_instr_type_tex: {
+      nir_tex_instr *tex = nir_instr_as_tex(instr);
+
+      for (unsigned i = 0; i < tex->num_srcs; ++i) {
+         nir_tex_src_type src_type = tex->src[i].src_type;
+
+         if (src_type != nir_tex_src_texture_handle &&
+             src_type != nir_tex_src_sampler_handle &&
+             src_type != nir_tex_src_texture_offset &&
+             src_type != nir_tex_src_sampler_offset)
+            continue;
+
+         if (nir_src_is_divergent(&tex->src[i].src)) {
+            tex->backend_flags |= BRW_TEX_INSTR_FUSED_EU_DISABLE;
+            return true;
+         }
+      }
+      return false;
+   }
+
+   case nir_instr_type_intrinsic: {
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+      /* We only need to care about intrinsics that refer to a structure/descriptor
+       * outside of the EU's registers like RENDER_SURFACE_STATE/SAMPLER_STATE,
+       * because the fusing will pick one thread's descriptor handle and use that
+       * for the 2 fused threads.
+       *
+       * Global pointers don't have that problem since all the access's data is
+       * per lane in the payload of the SEND message (the 64bit pointer).
+       *
+       * URB/shared-memory don't have that problem either because there is no
+       * descriptor information outside the EU, it's just a per lane
+       * handle/offset.
+       */
+      switch (intrin->intrinsic) {
+      case nir_intrinsic_load_ssbo_uniform_block_intel:
+      case nir_intrinsic_load_ubo_uniform_block_intel:
+      case nir_intrinsic_load_ssbo_block_intel:
+      case nir_intrinsic_load_ssbo_intel:
+      case nir_intrinsic_store_ssbo_intel:
+      case nir_intrinsic_load_ssbo:
+      case nir_intrinsic_store_ssbo:
+      case nir_intrinsic_get_ssbo_size:
+      case nir_intrinsic_load_ubo:
+      case nir_intrinsic_image_load:
+      case nir_intrinsic_image_store:
+      case nir_intrinsic_image_size:
+      case nir_intrinsic_image_levels:
+      case nir_intrinsic_image_atomic:
+      case nir_intrinsic_image_atomic_swap:
+      case nir_intrinsic_bindless_image_load:
+      case nir_intrinsic_bindless_image_store:
+      case nir_intrinsic_bindless_image_size:
+      case nir_intrinsic_bindless_image_levels:
+      case nir_intrinsic_bindless_image_atomic:
+      case nir_intrinsic_bindless_image_atomic_swap: {
+         int src_idx = nir_get_io_index_src_number(intrin);
+         if (nir_src_is_divergent(&intrin->src[src_idx])) {
+            nir_intrinsic_set_access(intrin,
+                                     nir_intrinsic_access(intrin) |
+                                     ACCESS_FUSED_EU_DISABLE_INTEL);
+            return true;
+         }
+         return false;
+      }
+
+      default:
+         return false;
+      }
+   }
+
+   default:
+      return false;
+   }
+}
+
 /* Prepare the given shader for codegen
  *
  * This function is intended to be called right before going into the actual
@@ -2283,6 +2363,28 @@ brw_postprocess_nir_opts(nir_shader *nir, const struct brw_compiler *compiler,
 
       OPT(nir_lower_subgroups, &subgroups_options);
    }
+
+   /* Deal with EU fusion */
+   if (devinfo->ver == 12) {
+      nir_divergence_options options =
+         nir_divergence_across_subgroups |
+         nir_divergence_multiple_workgroup_per_compute_subgroup;
+
+      nir_foreach_function_impl(impl, nir) {
+         nir_divergence_analysis_impl(impl, options);
+      }
+
+      nir_shader_instructions_pass(nir,
+                                   flag_fused_eu_disable_instr,
+                                   nir_metadata_all, NULL);
+
+      /* We requested special divergence information which is not needed
+       * afterwards.
+       */
+      nir_foreach_function_impl(impl, nir) {
+         nir_progress(true, impl, ~nir_metadata_divergence);
+      }
+   }
 }
 
 void
diff --git a/src/intel/compiler/brw/brw_nir.h b/src/intel/compiler/brw/brw_nir.h
index 078e5d1b21e..d9195f1c062 100644
--- a/src/intel/compiler/brw/brw_nir.h
+++ b/src/intel/compiler/brw/brw_nir.h
@@ -32,6 +32,8 @@ extern "C" {
 #endif
 
+#define BRW_TEX_INSTR_FUSED_EU_DISABLE (1u << 30)
+
 extern const struct nir_shader_compiler_options brw_scalar_nir_options;
 
 int type_size_vec4(const struct glsl_type *type, bool bindless);
diff --git a/src/intel/compiler/brw/brw_opt_cse.cpp b/src/intel/compiler/brw/brw_opt_cse.cpp
index c7983d311e4..8e2267b9479 100644
--- a/src/intel/compiler/brw/brw_opt_cse.cpp
+++ b/src/intel/compiler/brw/brw_opt_cse.cpp
@@ -391,9 +391,7 @@ hash_inst(const void *v)
       const uint8_t tex_u8data[] = {
          tex->coord_components,
         tex->grad_components,
-         tex->residency,
-         tex->surface_bindless,
-         tex->sampler_bindless,
+         tex->bits,
       };
       const uint32_t tex_u32data[] = {
          tex->sampler_opcode,