mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-21 18:00:13 +01:00
intel/fs: DPAS lowering
Implements integer dot product lowering both with and without DP4A. Implements half-float dot product lowering. There are a couple of FINISHME comments describing future optimizations. v2: Add a brw_compiler::lower_dpas flag to track when the lowering should be applied. v3: Use is_null() instead of checking file != ARF. Suggested by Caio. Reviewed-by: Caio Oliveira <caio.oliveira@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25994>
This commit is contained in:
parent
3cb9625539
commit
3756f60558
6 changed files with 323 additions and 0 deletions
|
|
@ -121,6 +121,9 @@ brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo)
|
||||||
/* Default to the sampler since that's what we've done since forever */
|
/* Default to the sampler since that's what we've done since forever */
|
||||||
compiler->indirect_ubos_use_sampler = true;
|
compiler->indirect_ubos_use_sampler = true;
|
||||||
|
|
||||||
|
compiler->lower_dpas = devinfo->verx10 < 125 ||
|
||||||
|
debug_get_bool_option("INTEL_LOWER_DPAS", false);
|
||||||
|
|
||||||
/* There is no vec4 mode on Gfx10+, and we don't use it at all on Gfx8+. */
|
/* There is no vec4 mode on Gfx10+, and we don't use it at all on Gfx8+. */
|
||||||
for (int i = MESA_SHADER_VERTEX; i < MESA_ALL_SHADER_STAGES; i++) {
|
for (int i = MESA_SHADER_VERTEX; i < MESA_ALL_SHADER_STAGES; i++) {
|
||||||
compiler->scalar_stage[i] = devinfo->ver >= 8 ||
|
compiler->scalar_stage[i] = devinfo->ver >= 8 ||
|
||||||
|
|
|
||||||
|
|
@ -128,6 +128,14 @@ struct brw_compiler {
|
||||||
*/
|
*/
|
||||||
bool use_bindless_sampler_offset;
|
bool use_bindless_sampler_offset;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Should DPAS instructions be lowered?
|
||||||
|
*
|
||||||
|
* This will be set for all platforms before Gfx12.5. It may also be set
|
||||||
|
* on platforms that support DPAS for testing purposes.
|
||||||
|
*/
|
||||||
|
bool lower_dpas;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Calling the ra_allocate function after each register spill can take
|
* Calling the ra_allocate function after each register spill can take
|
||||||
* several minutes. This option speeds up shader compilation by spilling
|
* several minutes. This option speeds up shader compilation by spilling
|
||||||
|
|
|
||||||
|
|
@ -6134,6 +6134,9 @@ fs_visitor::optimize()
|
||||||
|
|
||||||
validate();
|
validate();
|
||||||
|
|
||||||
|
if (compiler->lower_dpas)
|
||||||
|
OPT(brw_lower_dpas, *this);
|
||||||
|
|
||||||
OPT(split_virtual_grfs);
|
OPT(split_virtual_grfs);
|
||||||
|
|
||||||
/* Before anything else, eliminate dead code. The results of some NIR
|
/* Before anything else, eliminate dead code. The results of some NIR
|
||||||
|
|
|
||||||
|
|
@ -626,6 +626,8 @@ void brw_emit_predicate_on_sample_mask(const brw::fs_builder &bld, fs_inst *inst
|
||||||
int brw_get_subgroup_id_param_index(const intel_device_info *devinfo,
|
int brw_get_subgroup_id_param_index(const intel_device_info *devinfo,
|
||||||
const brw_stage_prog_data *prog_data);
|
const brw_stage_prog_data *prog_data);
|
||||||
|
|
||||||
|
bool brw_lower_dpas(fs_visitor &v);
|
||||||
|
|
||||||
void nir_to_brw(fs_visitor *s);
|
void nir_to_brw(fs_visitor *s);
|
||||||
|
|
||||||
#endif /* BRW_FS_H */
|
#endif /* BRW_FS_H */
|
||||||
|
|
|
||||||
306
src/intel/compiler/brw_fs_lower_dpas.cpp
Normal file
306
src/intel/compiler/brw_fs_lower_dpas.cpp
Normal file
|
|
@ -0,0 +1,306 @@
|
||||||
|
/*
|
||||||
|
* Copyright 2023 Intel Corporation
|
||||||
|
* SPDX-License-Identifier: MIT
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "brw_fs.h"
|
||||||
|
#include "brw_fs_builder.h"
|
||||||
|
|
||||||
|
using namespace brw;
|
||||||
|
|
||||||
|
static void
|
||||||
|
f16_using_mac(const fs_builder &bld, fs_inst *inst)
|
||||||
|
{
|
||||||
|
/* We only intend to support configurations where the destination and
|
||||||
|
* accumulator have the same type.
|
||||||
|
*/
|
||||||
|
if (!inst->src[0].is_null())
|
||||||
|
assert(inst->dst.type == inst->src[0].type);
|
||||||
|
|
||||||
|
assert(inst->src[1].type == BRW_REGISTER_TYPE_HF);
|
||||||
|
assert(inst->src[2].type == BRW_REGISTER_TYPE_HF);
|
||||||
|
|
||||||
|
const brw_reg_type src0_type = inst->dst.type;
|
||||||
|
const brw_reg_type src1_type = BRW_REGISTER_TYPE_HF;
|
||||||
|
const brw_reg_type src2_type = BRW_REGISTER_TYPE_HF;
|
||||||
|
|
||||||
|
const fs_reg dest = inst->dst;
|
||||||
|
fs_reg src0 = inst->src[0];
|
||||||
|
const fs_reg src1 = retype(inst->src[1], src1_type);
|
||||||
|
const fs_reg src2 = retype(inst->src[2], src2_type);
|
||||||
|
|
||||||
|
const unsigned dest_stride =
|
||||||
|
dest.type == BRW_REGISTER_TYPE_HF ? REG_SIZE / 2 : REG_SIZE;
|
||||||
|
|
||||||
|
for (unsigned r = 0; r < inst->rcount; r++) {
|
||||||
|
fs_reg temp = bld.vgrf(BRW_REGISTER_TYPE_HF, 1);
|
||||||
|
|
||||||
|
for (unsigned subword = 0; subword < 2; subword++) {
|
||||||
|
for (unsigned s = 0; s < inst->sdepth; s++) {
|
||||||
|
/* The first multiply of the dot-product operation has to
|
||||||
|
* explicitly write the accumulator register. The successive MAC
|
||||||
|
* instructions will implicitly read *and* write the
|
||||||
|
* accumulator. Those MAC instructions can also optionally
|
||||||
|
* explicitly write some other register.
|
||||||
|
*
|
||||||
|
* FINISHME: The accumulator can actually hold 16 HF values. On
|
||||||
|
* Gfx12 there are two accumulators. It should be possible to do
|
||||||
|
* this in SIMD16 or even SIMD32. I was unable to get this to work
|
||||||
|
* properly.
|
||||||
|
*/
|
||||||
|
if (s == 0 && subword == 0) {
|
||||||
|
const unsigned acc_width = 8;
|
||||||
|
fs_reg acc = suboffset(retype(brw_acc_reg(inst->exec_size), BRW_REGISTER_TYPE_UD),
|
||||||
|
inst->group % acc_width);
|
||||||
|
|
||||||
|
if (bld.shader->devinfo->verx10 >= 125) {
|
||||||
|
acc = subscript(acc, BRW_REGISTER_TYPE_HF, subword);
|
||||||
|
} else {
|
||||||
|
acc = retype(acc, BRW_REGISTER_TYPE_HF);
|
||||||
|
}
|
||||||
|
|
||||||
|
bld.MUL(acc,
|
||||||
|
subscript(retype(byte_offset(src1, s * REG_SIZE),
|
||||||
|
BRW_REGISTER_TYPE_UD),
|
||||||
|
BRW_REGISTER_TYPE_HF, subword),
|
||||||
|
component(retype(byte_offset(src2, r * REG_SIZE),
|
||||||
|
BRW_REGISTER_TYPE_HF),
|
||||||
|
s * 2 + subword))
|
||||||
|
->writes_accumulator = true;
|
||||||
|
|
||||||
|
} else {
|
||||||
|
fs_reg result;
|
||||||
|
|
||||||
|
/* As mentioned above, the MAC had an optional, explicit
|
||||||
|
* destination register. Various optimization passes are not
|
||||||
|
* clever enough to understand the intricacies of this
|
||||||
|
* instruction, so only write the result register on the final
|
||||||
|
* MAC in the sequence.
|
||||||
|
*/
|
||||||
|
if ((s + 1) == inst->sdepth && subword == 1)
|
||||||
|
result = temp;
|
||||||
|
else
|
||||||
|
result = retype(bld.null_reg_ud(), BRW_REGISTER_TYPE_HF);
|
||||||
|
|
||||||
|
bld.MAC(result,
|
||||||
|
subscript(retype(byte_offset(src1, s * REG_SIZE),
|
||||||
|
BRW_REGISTER_TYPE_UD),
|
||||||
|
BRW_REGISTER_TYPE_HF, subword),
|
||||||
|
component(retype(byte_offset(src2, r * REG_SIZE),
|
||||||
|
BRW_REGISTER_TYPE_HF),
|
||||||
|
s * 2 + subword))
|
||||||
|
->writes_accumulator = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!src0.is_null()) {
|
||||||
|
if (src0_type != BRW_REGISTER_TYPE_HF) {
|
||||||
|
fs_reg temp2 = bld.vgrf(src0_type, 1);
|
||||||
|
|
||||||
|
bld.MOV(temp2, temp);
|
||||||
|
|
||||||
|
bld.ADD(byte_offset(dest, r * dest_stride),
|
||||||
|
temp2,
|
||||||
|
byte_offset(src0, r * dest_stride));
|
||||||
|
} else {
|
||||||
|
bld.ADD(byte_offset(dest, r * dest_stride),
|
||||||
|
temp,
|
||||||
|
byte_offset(src0, r * dest_stride));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
bld.MOV(byte_offset(dest, r * dest_stride), temp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
int8_using_dp4a(const fs_builder &bld, fs_inst *inst)
|
||||||
|
{
|
||||||
|
/* We only intend to support configurations where the destination and
|
||||||
|
* accumulator have the same type.
|
||||||
|
*/
|
||||||
|
if (!inst->src[0].is_null())
|
||||||
|
assert(inst->dst.type == inst->src[0].type);
|
||||||
|
|
||||||
|
assert(inst->src[1].type == BRW_REGISTER_TYPE_B ||
|
||||||
|
inst->src[1].type == BRW_REGISTER_TYPE_UB);
|
||||||
|
assert(inst->src[2].type == BRW_REGISTER_TYPE_B ||
|
||||||
|
inst->src[2].type == BRW_REGISTER_TYPE_UB);
|
||||||
|
|
||||||
|
const brw_reg_type src1_type = inst->src[1].type == BRW_REGISTER_TYPE_UB
|
||||||
|
? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
|
||||||
|
|
||||||
|
const brw_reg_type src2_type = inst->src[2].type == BRW_REGISTER_TYPE_UB
|
||||||
|
? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
|
||||||
|
|
||||||
|
fs_reg dest = inst->dst;
|
||||||
|
fs_reg src0 = inst->src[0];
|
||||||
|
const fs_reg src1 = retype(inst->src[1], src1_type);
|
||||||
|
const fs_reg src2 = retype(inst->src[2], src2_type);
|
||||||
|
|
||||||
|
const unsigned dest_stride = REG_SIZE;
|
||||||
|
|
||||||
|
for (unsigned r = 0; r < inst->rcount; r++) {
|
||||||
|
if (!src0.is_null()) {
|
||||||
|
bld.MOV(dest, src0);
|
||||||
|
src0 = byte_offset(src0, dest_stride);
|
||||||
|
} else {
|
||||||
|
bld.MOV(dest, retype(brw_imm_d(0), dest.type));
|
||||||
|
}
|
||||||
|
|
||||||
|
for (unsigned s = 0; s < inst->sdepth; s++) {
|
||||||
|
bld.DP4A(dest,
|
||||||
|
dest,
|
||||||
|
byte_offset(src1, s * REG_SIZE),
|
||||||
|
component(byte_offset(src2, r * REG_SIZE), s))
|
||||||
|
->saturate = inst->saturate;
|
||||||
|
}
|
||||||
|
|
||||||
|
dest = byte_offset(dest, dest_stride);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
int8_using_mul_add(const fs_builder &bld, fs_inst *inst)
|
||||||
|
{
|
||||||
|
/* We only intend to support configurations where the destination and
|
||||||
|
* accumulator have the same type.
|
||||||
|
*/
|
||||||
|
if (!inst->src[0].is_null())
|
||||||
|
assert(inst->dst.type == inst->src[0].type);
|
||||||
|
|
||||||
|
assert(inst->src[1].type == BRW_REGISTER_TYPE_B ||
|
||||||
|
inst->src[1].type == BRW_REGISTER_TYPE_UB);
|
||||||
|
assert(inst->src[2].type == BRW_REGISTER_TYPE_B ||
|
||||||
|
inst->src[2].type == BRW_REGISTER_TYPE_UB);
|
||||||
|
|
||||||
|
const brw_reg_type src0_type = inst->dst.type;
|
||||||
|
|
||||||
|
const brw_reg_type src1_type = inst->src[1].type == BRW_REGISTER_TYPE_UB
|
||||||
|
? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
|
||||||
|
|
||||||
|
const brw_reg_type src2_type = inst->src[2].type == BRW_REGISTER_TYPE_UB
|
||||||
|
? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
|
||||||
|
|
||||||
|
fs_reg dest = inst->dst;
|
||||||
|
fs_reg src0 = inst->src[0];
|
||||||
|
const fs_reg src1 = retype(inst->src[1], src1_type);
|
||||||
|
const fs_reg src2 = retype(inst->src[2], src2_type);
|
||||||
|
|
||||||
|
const unsigned dest_stride = REG_SIZE;
|
||||||
|
|
||||||
|
for (unsigned r = 0; r < inst->rcount; r++) {
|
||||||
|
if (!src0.is_null()) {
|
||||||
|
bld.MOV(dest, src0);
|
||||||
|
src0 = byte_offset(src0, dest_stride);
|
||||||
|
} else {
|
||||||
|
bld.MOV(dest, retype(brw_imm_d(0), dest.type));
|
||||||
|
}
|
||||||
|
|
||||||
|
for (unsigned s = 0; s < inst->sdepth; s++) {
|
||||||
|
fs_reg temp1 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
|
||||||
|
fs_reg temp2 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
|
||||||
|
fs_reg temp3 = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
|
||||||
|
const brw_reg_type temp_type =
|
||||||
|
(inst->src[1].type == BRW_REGISTER_TYPE_B ||
|
||||||
|
inst->src[2].type == BRW_REGISTER_TYPE_B)
|
||||||
|
? BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW;
|
||||||
|
|
||||||
|
/* Expand 8 dwords of packed bytes into 16 dwords of packed
|
||||||
|
* words.
|
||||||
|
*
|
||||||
|
* FINISHME: Gfx9 should not need this work around. Gfx11
|
||||||
|
* may be able to use integer MAD. Both platforms may be
|
||||||
|
* able to use MAC.
|
||||||
|
*/
|
||||||
|
bld.group(32, 0).MOV(retype(temp3, temp_type),
|
||||||
|
retype(byte_offset(src2, r * REG_SIZE),
|
||||||
|
inst->src[2].type));
|
||||||
|
|
||||||
|
bld.MUL(subscript(temp1, temp_type, 0),
|
||||||
|
subscript(retype(byte_offset(src1, s * REG_SIZE),
|
||||||
|
BRW_REGISTER_TYPE_UD),
|
||||||
|
inst->src[1].type, 0),
|
||||||
|
subscript(component(retype(temp3,
|
||||||
|
BRW_REGISTER_TYPE_UD),
|
||||||
|
s * 2),
|
||||||
|
temp_type, 0));
|
||||||
|
|
||||||
|
bld.MUL(subscript(temp1, temp_type, 1),
|
||||||
|
subscript(retype(byte_offset(src1, s * REG_SIZE),
|
||||||
|
BRW_REGISTER_TYPE_UD),
|
||||||
|
inst->src[1].type, 1),
|
||||||
|
subscript(component(retype(temp3,
|
||||||
|
BRW_REGISTER_TYPE_UD),
|
||||||
|
s * 2),
|
||||||
|
temp_type, 1));
|
||||||
|
|
||||||
|
bld.MUL(subscript(temp2, temp_type, 0),
|
||||||
|
subscript(retype(byte_offset(src1, s * REG_SIZE),
|
||||||
|
BRW_REGISTER_TYPE_UD),
|
||||||
|
inst->src[1].type, 2),
|
||||||
|
subscript(component(retype(temp3,
|
||||||
|
BRW_REGISTER_TYPE_UD),
|
||||||
|
s * 2 + 1),
|
||||||
|
temp_type, 0));
|
||||||
|
|
||||||
|
bld.MUL(subscript(temp2, temp_type, 1),
|
||||||
|
subscript(retype(byte_offset(src1, s * REG_SIZE),
|
||||||
|
BRW_REGISTER_TYPE_UD),
|
||||||
|
inst->src[1].type, 3),
|
||||||
|
subscript(component(retype(temp3,
|
||||||
|
BRW_REGISTER_TYPE_UD),
|
||||||
|
s * 2 + 1),
|
||||||
|
temp_type, 1));
|
||||||
|
|
||||||
|
bld.ADD(subscript(temp1, src0_type, 0),
|
||||||
|
subscript(temp1, temp_type, 0),
|
||||||
|
subscript(temp1, temp_type, 1));
|
||||||
|
|
||||||
|
bld.ADD(subscript(temp2, src0_type, 0),
|
||||||
|
subscript(temp2, temp_type, 0),
|
||||||
|
subscript(temp2, temp_type, 1));
|
||||||
|
|
||||||
|
bld.ADD(retype(temp1, src0_type),
|
||||||
|
retype(temp1, src0_type),
|
||||||
|
retype(temp2, src0_type));
|
||||||
|
|
||||||
|
bld.ADD(dest, dest, retype(temp1, src0_type))
|
||||||
|
->saturate = inst->saturate;
|
||||||
|
}
|
||||||
|
|
||||||
|
dest = byte_offset(dest, dest_stride);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
brw_lower_dpas(fs_visitor &v)
|
||||||
|
{
|
||||||
|
bool progress = false;
|
||||||
|
|
||||||
|
foreach_block_and_inst_safe(block, fs_inst, inst, v.cfg) {
|
||||||
|
if (inst->opcode != BRW_OPCODE_DPAS)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
const fs_builder bld = fs_builder(&v, block, inst).group(8, 0).exec_all();
|
||||||
|
|
||||||
|
if (brw_reg_type_is_floating_point(inst->dst.type)) {
|
||||||
|
f16_using_mac(bld, inst);
|
||||||
|
} else {
|
||||||
|
if (v.devinfo->ver >= 12) {
|
||||||
|
int8_using_dp4a(bld, inst);
|
||||||
|
} else {
|
||||||
|
int8_using_mul_add(bld, inst);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inst->remove(block);
|
||||||
|
progress = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (progress)
|
||||||
|
v.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
||||||
|
|
||||||
|
return progress;
|
||||||
|
}
|
||||||
|
|
@ -57,6 +57,7 @@ libintel_compiler_files = files(
|
||||||
'brw_fs.h',
|
'brw_fs.h',
|
||||||
'brw_fs_live_variables.cpp',
|
'brw_fs_live_variables.cpp',
|
||||||
'brw_fs_live_variables.h',
|
'brw_fs_live_variables.h',
|
||||||
|
'brw_fs_lower_dpas.cpp',
|
||||||
'brw_fs_lower_pack.cpp',
|
'brw_fs_lower_pack.cpp',
|
||||||
'brw_fs_lower_regioning.cpp',
|
'brw_fs_lower_regioning.cpp',
|
||||||
'brw_fs_nir.cpp',
|
'brw_fs_nir.cpp',
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue