diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c
index 7c20d5e9b4a..f3441316849 100644
--- a/src/intel/compiler/brw_compiler.c
+++ b/src/intel/compiler/brw_compiler.c
@@ -121,6 +121,9 @@ brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo)
    /* Default to the sampler since that's what we've done since forever */
    compiler->indirect_ubos_use_sampler = true;
 
+   compiler->lower_dpas = devinfo->verx10 < 125 ||
+      debug_get_bool_option("INTEL_LOWER_DPAS", false);
+
    /* There is no vec4 mode on Gfx10+, and we don't use it at all on Gfx8+. */
    for (int i = MESA_SHADER_VERTEX; i < MESA_ALL_SHADER_STAGES; i++) {
       compiler->scalar_stage[i] = devinfo->ver >= 8 ||
diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index b6cd0308c9d..bdc403c0d5a 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -128,6 +128,14 @@ struct brw_compiler {
     */
    bool use_bindless_sampler_offset;
 
+   /**
+    * Should DPAS instructions be lowered?
+    *
+    * This will be set for all platforms before Gfx12.5. It may also be set
+    * on platforms that support DPAS for testing purposes.
+    */
+   bool lower_dpas;
+
    /**
     * Calling the ra_allocate function after each register spill can take
     * several minutes. This option speeds up shader compilation by spilling
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index ca2d18639ae..c0937e0357b 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -6134,6 +6134,9 @@ fs_visitor::optimize()
 
    validate();
 
+   if (compiler->lower_dpas)
+      OPT(brw_lower_dpas, *this);
+
    OPT(split_virtual_grfs);
 
    /* Before anything else, eliminate dead code.  The results of some NIR
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index c7af424c8fc..a57274250b9 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -626,6 +626,8 @@ void brw_emit_predicate_on_sample_mask(const brw::fs_builder &bld, fs_inst *inst
 int brw_get_subgroup_id_param_index(const intel_device_info *devinfo,
                                     const brw_stage_prog_data *prog_data);
 
+bool brw_lower_dpas(fs_visitor &v);
+
 void nir_to_brw(fs_visitor *s);
 
 #endif /* BRW_FS_H */
diff --git a/src/intel/compiler/brw_fs_lower_dpas.cpp b/src/intel/compiler/brw_fs_lower_dpas.cpp
new file mode 100644
index 00000000000..306731722af
--- /dev/null
+++ b/src/intel/compiler/brw_fs_lower_dpas.cpp
@@ -0,0 +1,306 @@
+/*
+ * Copyright 2023 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "brw_fs.h"
+#include "brw_fs_builder.h"
+
+using namespace brw;
+
+static void
+f16_using_mac(const fs_builder &bld, fs_inst *inst)
+{
+   /* We only intend to support configurations where the destination and
+    * accumulator have the same type.
+    */
+   if (!inst->src[0].is_null())
+      assert(inst->dst.type == inst->src[0].type);
+
+   assert(inst->src[1].type == BRW_REGISTER_TYPE_HF);
+   assert(inst->src[2].type == BRW_REGISTER_TYPE_HF);
+
+   const brw_reg_type src0_type = inst->dst.type;
+   const brw_reg_type src1_type = BRW_REGISTER_TYPE_HF;
+   const brw_reg_type src2_type = BRW_REGISTER_TYPE_HF;
+
+   const fs_reg dest = inst->dst;
+   fs_reg src0 = inst->src[0];
+   const fs_reg src1 = retype(inst->src[1], src1_type);
+   const fs_reg src2 = retype(inst->src[2], src2_type);
+
+   const unsigned dest_stride = dest.type == BRW_REGISTER_TYPE_HF ?
+      REG_SIZE / 2 : REG_SIZE;
+
+   for (unsigned r = 0; r < inst->rcount; r++) {
+      fs_reg temp = bld.vgrf(BRW_REGISTER_TYPE_HF, 1);
+
+      for (unsigned subword = 0; subword < 2; subword++) {
+         for (unsigned s = 0; s < inst->sdepth; s++) {
+            /* The first multiply of the dot-product operation has to
+             * explicitly write the accumulator register. The successive MAC
+             * instructions will implicitly read *and* write the
+             * accumulator. Those MAC instructions can also optionally
+             * explicitly write some other register.
+             *
+             * FINISHME: The accumulator can actually hold 16 HF values. On
+             * Gfx12 there are two accumulators. It should be possible to do
+             * this in SIMD16 or even SIMD32. I was unable to get this to work
+             * properly.
+             */
+            if (s == 0 && subword == 0) {
+               const unsigned acc_width = 8;
+               fs_reg acc = suboffset(retype(brw_acc_reg(inst->exec_size), BRW_REGISTER_TYPE_UD),
+                                      inst->group % acc_width);
+
+               if (bld.shader->devinfo->verx10 >= 125) {
+                  acc = subscript(acc, BRW_REGISTER_TYPE_HF, subword);
+               } else {
+                  acc = retype(acc, BRW_REGISTER_TYPE_HF);
+               }
+
+               bld.MUL(acc,
+                       subscript(retype(byte_offset(src1, s * REG_SIZE),
+                                        BRW_REGISTER_TYPE_UD),
+                                 BRW_REGISTER_TYPE_HF, subword),
+                       component(retype(byte_offset(src2, r * REG_SIZE),
+                                        BRW_REGISTER_TYPE_HF),
+                                 s * 2 + subword))
+                  ->writes_accumulator = true;
+
+            } else {
+               fs_reg result;
+
+               /* As mentioned above, the MAC had an optional, explicit
+                * destination register. Various optimization passes are not
+                * clever enough to understand the intricacies of this
+                * instruction, so only write the result register on the final
+                * MAC in the sequence.
+                */
+               if ((s + 1) == inst->sdepth && subword == 1)
+                  result = temp;
+               else
+                  result = retype(bld.null_reg_ud(), BRW_REGISTER_TYPE_HF);
+
+               bld.MAC(result,
+                       subscript(retype(byte_offset(src1, s * REG_SIZE),
+                                        BRW_REGISTER_TYPE_UD),
+                                 BRW_REGISTER_TYPE_HF, subword),
+                       component(retype(byte_offset(src2, r * REG_SIZE),
+                                        BRW_REGISTER_TYPE_HF),
+                                 s * 2 + subword))
+                  ->writes_accumulator = true;
+            }
+         }
+      }
+
+      if (!src0.is_null()) {
+         if (src0_type != BRW_REGISTER_TYPE_HF) {
+            fs_reg temp2 = bld.vgrf(src0_type, 1);
+
+            bld.MOV(temp2, temp);
+
+            bld.ADD(byte_offset(dest, r * dest_stride),
+                    temp2,
+                    byte_offset(src0, r * dest_stride));
+         } else {
+            bld.ADD(byte_offset(dest, r * dest_stride),
+                    temp,
+                    byte_offset(src0, r * dest_stride));
+         }
+      } else {
+         bld.MOV(byte_offset(dest, r * dest_stride), temp);
+      }
+   }
+}
+
+static void
+int8_using_dp4a(const fs_builder &bld, fs_inst *inst)
+{
+   /* We only intend to support configurations where the destination and
+    * accumulator have the same type.
+    */
+   if (!inst->src[0].is_null())
+      assert(inst->dst.type == inst->src[0].type);
+
+   assert(inst->src[1].type == BRW_REGISTER_TYPE_B ||
+          inst->src[1].type == BRW_REGISTER_TYPE_UB);
+   assert(inst->src[2].type == BRW_REGISTER_TYPE_B ||
+          inst->src[2].type == BRW_REGISTER_TYPE_UB);
+
+   const brw_reg_type src1_type = inst->src[1].type == BRW_REGISTER_TYPE_UB ?
+      BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
+
+   const brw_reg_type src2_type = inst->src[2].type == BRW_REGISTER_TYPE_UB ?
+      BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
+
+   fs_reg dest = inst->dst;
+   fs_reg src0 = inst->src[0];
+   const fs_reg src1 = retype(inst->src[1], src1_type);
+   const fs_reg src2 = retype(inst->src[2], src2_type);
+
+   const unsigned dest_stride = REG_SIZE;
+
+   for (unsigned r = 0; r < inst->rcount; r++) {
+      if (!src0.is_null()) {
+         bld.MOV(dest, src0);
+         src0 = byte_offset(src0, dest_stride);
+      } else {
+         bld.MOV(dest, retype(brw_imm_d(0), dest.type));
+      }
+
+      for (unsigned s = 0; s < inst->sdepth; s++) {
+         bld.DP4A(dest,
+                  dest,
+                  byte_offset(src1, s * REG_SIZE),
+                  component(byte_offset(src2, r * REG_SIZE), s))
+            ->saturate = inst->saturate;
+      }
+
+      dest = byte_offset(dest, dest_stride);
+   }
+}
+
+static void
+int8_using_mul_add(const fs_builder &bld, fs_inst *inst)
+{
+   /* We only intend to support configurations where the destination and
+    * accumulator have the same type.
+    */
+   if (!inst->src[0].is_null())
+      assert(inst->dst.type == inst->src[0].type);
+
+   assert(inst->src[1].type == BRW_REGISTER_TYPE_B ||
+          inst->src[1].type == BRW_REGISTER_TYPE_UB);
+   assert(inst->src[2].type == BRW_REGISTER_TYPE_B ||
+          inst->src[2].type == BRW_REGISTER_TYPE_UB);
+
+   const brw_reg_type src0_type = inst->dst.type;
+
+   const brw_reg_type src1_type = inst->src[1].type == BRW_REGISTER_TYPE_UB ?
+      BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
+
+   const brw_reg_type src2_type = inst->src[2].type == BRW_REGISTER_TYPE_UB ?
+      BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
+
+   fs_reg dest = inst->dst;
+   fs_reg src0 = inst->src[0];
+   const fs_reg src1 = retype(inst->src[1], src1_type);
+   const fs_reg src2 = retype(inst->src[2], src2_type);
+
+   const unsigned dest_stride = REG_SIZE;
+
+   for (unsigned r = 0; r < inst->rcount; r++) {
+      if (!src0.is_null()) {
+         bld.MOV(dest, src0);
+         src0 = byte_offset(src0, dest_stride);
+      } else {
+         bld.MOV(dest, retype(brw_imm_d(0), dest.type));
+      }
+
+      for (unsigned s = 0; s < inst->sdepth; s++) {
+         fs_reg temp1 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+         fs_reg temp2 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+         fs_reg temp3 = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+         const brw_reg_type temp_type =
+            (inst->src[1].type == BRW_REGISTER_TYPE_B ||
+             inst->src[2].type == BRW_REGISTER_TYPE_B)
+            ? BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW;
+
+         /* Expand 8 dwords of packed bytes into 16 dwords of packed
+          * words.
+          *
+          * FINISHME: Gfx9 should not need this work around. Gfx11
+          * may be able to use integer MAD. Both platforms may be
+          * able to use MAC.
+          */
+         bld.group(32, 0).MOV(retype(temp3, temp_type),
+                              retype(byte_offset(src2, r * REG_SIZE),
+                                     inst->src[2].type));
+
+         bld.MUL(subscript(temp1, temp_type, 0),
+                 subscript(retype(byte_offset(src1, s * REG_SIZE),
+                                  BRW_REGISTER_TYPE_UD),
+                           inst->src[1].type, 0),
+                 subscript(component(retype(temp3,
+                                            BRW_REGISTER_TYPE_UD),
+                                     s * 2),
+                           temp_type, 0));
+
+         bld.MUL(subscript(temp1, temp_type, 1),
+                 subscript(retype(byte_offset(src1, s * REG_SIZE),
+                                  BRW_REGISTER_TYPE_UD),
+                           inst->src[1].type, 1),
+                 subscript(component(retype(temp3,
+                                            BRW_REGISTER_TYPE_UD),
+                                     s * 2),
+                           temp_type, 1));
+
+         bld.MUL(subscript(temp2, temp_type, 0),
+                 subscript(retype(byte_offset(src1, s * REG_SIZE),
+                                  BRW_REGISTER_TYPE_UD),
+                           inst->src[1].type, 2),
+                 subscript(component(retype(temp3,
+                                            BRW_REGISTER_TYPE_UD),
+                                     s * 2 + 1),
+                           temp_type, 0));
+
+         bld.MUL(subscript(temp2, temp_type, 1),
+                 subscript(retype(byte_offset(src1, s * REG_SIZE),
+                                  BRW_REGISTER_TYPE_UD),
+                           inst->src[1].type, 3),
+                 subscript(component(retype(temp3,
+                                            BRW_REGISTER_TYPE_UD),
+                                     s * 2 + 1),
+                           temp_type, 1));
+
+         bld.ADD(subscript(temp1, src0_type, 0),
+                 subscript(temp1, temp_type, 0),
+                 subscript(temp1, temp_type, 1));
+
+         bld.ADD(subscript(temp2, src0_type, 0),
+                 subscript(temp2, temp_type, 0),
+                 subscript(temp2, temp_type, 1));
+
+         bld.ADD(retype(temp1, src0_type),
+                 retype(temp1, src0_type),
+                 retype(temp2, src0_type));
+
+         bld.ADD(dest, dest, retype(temp1, src0_type))
+            ->saturate = inst->saturate;
+      }
+
+      dest = byte_offset(dest, dest_stride);
+   }
+}
+
+bool
+brw_lower_dpas(fs_visitor &v)
+{
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, v.cfg) {
+      if (inst->opcode != BRW_OPCODE_DPAS)
+         continue;
+
+      const fs_builder bld = fs_builder(&v, block, inst).group(8, 0).exec_all();
+
+      if (brw_reg_type_is_floating_point(inst->dst.type)) {
+         f16_using_mac(bld, inst);
+      } else {
+         if (v.devinfo->ver >= 12) {
+            int8_using_dp4a(bld, inst);
+         } else {
+            int8_using_mul_add(bld, inst);
+         }
+      }
+
+      inst->remove(block);
+      progress = true;
+   }
+
+   if (progress)
+      v.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
+
+   return progress;
+}
diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build
index 5fd08abd4fb..0372c5aff9f 100644
--- a/src/intel/compiler/meson.build
+++ b/src/intel/compiler/meson.build
@@ -57,6 +57,7 @@ libintel_compiler_files = files(
   'brw_fs.h',
   'brw_fs_live_variables.cpp',
   'brw_fs_live_variables.h',
+  'brw_fs_lower_dpas.cpp',
   'brw_fs_lower_pack.cpp',
   'brw_fs_lower_regioning.cpp',
   'brw_fs_nir.cpp',