diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h
index 19b0dc6c111..227604c8210 100644
--- a/src/intel/compiler/brw_eu_defines.h
+++ b/src/intel/compiler/brw_eu_defines.h
@@ -947,6 +947,10 @@ enum a64_logical_srcs {
    A64_LOGICAL_SRC,
    /** Per-opcode immediate argument. Number of dwords, bit size, or atomic op. */
    A64_LOGICAL_ARG,
+   /**
+    * Some instructions do want to run on helper lanes (like ray queries).
+    */
+   A64_LOGICAL_ENABLE_HELPERS,
 
    A64_LOGICAL_NUM_SRCS
 };
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index c68e71c19f5..3bd002726ad 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -5475,6 +5475,40 @@ emit_predicate_on_sample_mask(const fs_builder &bld, fs_inst *inst)
    }
 }
 
+/**
+ * Predicate the specified instruction on the vector mask.
+ */
+static void
+emit_predicate_on_vector_mask(const fs_builder &bld, fs_inst *inst)
+{
+   assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
+          bld.group() == inst->group &&
+          bld.dispatch_width() == inst->exec_size);
+
+   const fs_builder ubld = bld.exec_all().group(1, 0);
+
+   const fs_visitor *v = static_cast<const fs_visitor *>(bld.shader);
+   const fs_reg vector_mask = ubld.vgrf(BRW_REGISTER_TYPE_UW);
+   ubld.emit(SHADER_OPCODE_READ_SR_REG, vector_mask, brw_imm_ud(3));
+   const unsigned subreg = sample_mask_flag_subreg(v);
+
+   ubld.MOV(brw_flag_subreg(subreg + inst->group / 16), vector_mask);
+
+   if (inst->predicate) {
+      assert(inst->predicate == BRW_PREDICATE_NORMAL);
+      assert(!inst->predicate_inverse);
+      assert(inst->flag_subreg == 0);
+      /* Combine the vector mask with the existing predicate by using a
+       * vertical predication mode.
+       */
+      inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;
+   } else {
+      inst->flag_subreg = subreg;
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      inst->predicate_inverse = false;
+   }
+}
+
 static void
 setup_surface_descriptors(const fs_builder &bld, fs_inst *inst, uint32_t desc,
                           const fs_reg &surface, const fs_reg &surface_handle)
@@ -6068,6 +6102,26 @@ emit_a64_oword_block_header(const fs_builder &bld, const fs_reg &addr)
    return header;
 }
 
+static void
+emit_fragment_mask(const fs_builder &bld, fs_inst *inst)
+{
+   assert(inst->src[A64_LOGICAL_ENABLE_HELPERS].file == IMM);
+   const bool enable_helpers = inst->src[A64_LOGICAL_ENABLE_HELPERS].ud;
+
+   /* If we're a fragment shader, we have to predicate with the sample mask to
+    * avoid helper invocations in instructions with side effects, unless they
+    * are explicitly required.
+    *
+    * There are also special cases when we actually want to run on helpers
+    * (ray queries).
+    */
+   assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
+   if (enable_helpers)
+      emit_predicate_on_vector_mask(bld, inst);
+   else if (inst->has_side_effects())
+      emit_predicate_on_sample_mask(bld, inst);
+}
+
 static void
 lower_lsc_a64_logical_send(const fs_builder &bld, fs_inst *inst)
 {
@@ -6083,12 +6137,6 @@ lower_lsc_a64_logical_send(const fs_builder &bld, fs_inst *inst)
    const unsigned arg = inst->src[A64_LOGICAL_ARG].ud;
    const bool has_side_effects = inst->has_side_effects();
 
-   /* If the surface message has side effects and we're a fragment shader, we
-    * have to predicate with the sample mask to avoid helper invocations.
-    */
-   if (has_side_effects && bld.shader->stage == MESA_SHADER_FRAGMENT)
-      emit_predicate_on_sample_mask(bld, inst);
-
    fs_reg payload = retype(bld.move_to_vgrf(addr, 1), BRW_REGISTER_TYPE_UD);
    fs_reg payload2 = retype(bld.move_to_vgrf(src, src_comps),
                             BRW_REGISTER_TYPE_UD);
@@ -6164,6 +6212,9 @@ lower_lsc_a64_logical_send(const fs_builder &bld, fs_inst *inst)
       unreachable("Unknown A64 logical instruction");
    }
 
+   if (bld.shader->stage == MESA_SHADER_FRAGMENT)
+      emit_fragment_mask(bld, inst);
+
    /* Update the original instruction. */
    inst->opcode = SHADER_OPCODE_SEND;
    inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
@@ -6193,12 +6244,6 @@ lower_a64_logical_send(const fs_builder &bld, fs_inst *inst)
    const unsigned arg = inst->src[A64_LOGICAL_ARG].ud;
    const bool has_side_effects = inst->has_side_effects();
 
-   /* If the surface message has side effects and we're a fragment shader, we
-    * have to predicate with the sample mask to avoid helper invocations.
-    */
-   if (has_side_effects && bld.shader->stage == MESA_SHADER_FRAGMENT)
-      emit_predicate_on_sample_mask(bld, inst);
-
    fs_reg payload, payload2;
    unsigned mlen, ex_mlen = 0, header_size = 0;
    if (inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL ||
@@ -6322,6 +6367,9 @@ lower_a64_logical_send(const fs_builder &bld, fs_inst *inst)
       unreachable("Unknown A64 logical instruction");
    }
 
+   if (bld.shader->stage == MESA_SHADER_FRAGMENT)
+      emit_fragment_mask(bld, inst);
+
    /* Update the original instruction. */
    inst->opcode = SHADER_OPCODE_SEND;
    inst->mlen = mlen;
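
The new source slot above is consumed by emit_fragment_mask(); the
brw_fs_nir.cpp changes below fill it in. As a minimal sketch (not part of the
patch) of what an emitter would do to request a helper-lane-enabled A64 load,
assuming `bld`, `dest`, a 64-bit `addr` and a dword count `num_components`
come from the surrounding fs_visitor code:

   fs_reg srcs[A64_LOGICAL_NUM_SRCS];
   srcs[A64_LOGICAL_ADDRESS] = addr;
   srcs[A64_LOGICAL_SRC] = fs_reg(); /* loads carry no source data */
   srcs[A64_LOGICAL_ARG] = brw_imm_ud(num_components);
   /* Non-zero: emit_fragment_mask() predicates on the vector mask rather
    * than the sample mask, so the message also executes on helper lanes.
    */
   srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(1);

   fs_inst *inst = bld.emit(SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL, dest,
                            srcs, A64_LOGICAL_NUM_SRCS);
   inst->size_written =
      num_components * dest.component_size(bld.dispatch_width());

With brw_imm_ud(0) here instead, a plain load stays unpredicated and a
message with side effects falls back to the existing sample-mask predication.
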
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 5fec0e7963d..c5897f45eb3 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4792,6 +4792,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       fs_reg srcs[A64_LOGICAL_NUM_SRCS];
       srcs[A64_LOGICAL_ADDRESS] = get_nir_src(instr->src[0]);
       srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */
+      srcs[A64_LOGICAL_ENABLE_HELPERS] =
+         brw_imm_ud(nir_intrinsic_access(instr) & ACCESS_INCLUDE_HELPERS);
 
       if (nir_dest_bit_size(instr->dest) == 32 &&
           nir_intrinsic_align(instr) >= 4) {
@@ -4828,6 +4830,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
 
       fs_reg srcs[A64_LOGICAL_NUM_SRCS];
       srcs[A64_LOGICAL_ADDRESS] = get_nir_src(instr->src[1]);
+      srcs[A64_LOGICAL_ENABLE_HELPERS] =
+         brw_imm_ud(nir_intrinsic_access(instr) & ACCESS_INCLUDE_HELPERS);
 
       if (nir_src_bit_size(instr->src[0]) == 32 &&
           nir_intrinsic_align(instr) >= 4) {
@@ -4912,6 +4916,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
          srcs[A64_LOGICAL_ADDRESS] = addr;
          srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */
          srcs[A64_LOGICAL_ARG] = brw_imm_ud(instr->num_components);
+         /* This intrinsic loads memory from a uniform address, sometimes
+          * shared across lanes. We never need to mask it.
+          */
+         srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0);
 
          fs_inst *load = ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL,
                                    load_val, srcs, A64_LOGICAL_NUM_SRCS);
+ */ + srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0); fs_inst *load = ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL, load_val, srcs, A64_LOGICAL_NUM_SRCS); @@ -5616,6 +5624,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr srcs[A64_LOGICAL_ADDRESS] = address; srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */ srcs[A64_LOGICAL_ARG] = brw_imm_ud(block); + srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(1); ubld.emit(SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL, retype(byte_offset(dest, loaded * 4), BRW_REGISTER_TYPE_UD), srcs, A64_LOGICAL_NUM_SRCS)->size_written = block_bytes; @@ -5650,6 +5659,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr srcs[A64_LOGICAL_SRC] = retype(byte_offset(src, written * 4), BRW_REGISTER_TYPE_UD); srcs[A64_LOGICAL_ARG] = brw_imm_ud(block); + srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0); const fs_builder &ubld = block == 8 ? ubld8 : ubld16; ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL, fs_reg(), @@ -6054,6 +6064,7 @@ fs_visitor::nir_emit_global_atomic(const fs_builder &bld, srcs[A64_LOGICAL_ADDRESS] = addr; srcs[A64_LOGICAL_SRC] = data; srcs[A64_LOGICAL_ARG] = brw_imm_ud(op); + srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0); switch (nir_dest_bit_size(instr->dest)) { case 16: { @@ -6102,6 +6113,7 @@ fs_visitor::nir_emit_global_atomic_float(const fs_builder &bld, srcs[A64_LOGICAL_ADDRESS] = addr; srcs[A64_LOGICAL_SRC] = data; srcs[A64_LOGICAL_ARG] = brw_imm_ud(op); + srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0); switch (nir_dest_bit_size(instr->dest)) { case 16: {